1 // Mozilla has modified this file - see https://hg.mozilla.org/ for details.
2 /*
3  * Licensed to the Apache Software Foundation (ASF) under one or more
4  * contributor license agreements.  See the NOTICE file distributed with
5  * this work for additional information regarding copyright ownership.
6  * The ASF licenses this file to You under the Apache License, Version 2.0
7  * (the "License"); you may not use this file except in compliance with
8  * the License.  You may obtain a copy of the License at
9  *
10  *      http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 package org.mozilla.apache.commons.codec.binary;
20 
21 /**
22  * Provides Base32 encoding and decoding as defined by <a href="http://www.ietf.org/rfc/rfc4648.txt">RFC 4648</a>.
23  *
24  * <p>
25  * The class can be parameterized in the following manner with various constructors:
26  * <ul>
27  * <li>Whether to use the "base32hex" variant instead of the default "base32"</li>
28  * <li>Line length: Default 76. Line length that aren't multiples of 8 will still essentially end up being multiples of
29  * 8 in the encoded data.
30  * <li>Line separator: Default is CRLF ("\r\n")</li>
31  * </ul>
32  * </p>
33  * <p>
34  * This class operates directly on byte streams, and not character streams.
35  * </p>
36  * <p>
37  * This class is not thread-safe. Each thread should use its own instance.
38  * </p>
39  *
40  * @see <a href="http://www.ietf.org/rfc/rfc4648.txt">RFC 4648</a>
41  *
42  * @since 1.5
43  * @version $Revision: 1080712 $
44  */
45 public class Base32 extends BaseNCodec {
46 
47     /**
48      * BASE32 characters are 5 bits in length.
49      * They are formed by taking a block of five octets to form a 40-bit string,
50      * which is converted into eight BASE32 characters.
51      */
52     private static final int BITS_PER_ENCODED_BYTE = 5;
53     private static final int BYTES_PER_ENCODED_BLOCK = 8;
54     private static final int BYTES_PER_UNENCODED_BLOCK = 5;
55 
56     /**
57      * Chunk separator per RFC 2045 section 2.1.
58      *
59      * @see <a href="http://www.ietf.org/rfc/rfc2045.txt">RFC 2045 section 2.1</a>
60      */
61     private static final byte[] CHUNK_SEPARATOR = {'\r', '\n'};
62 
63     /**
64      * This array is a lookup table that translates Unicode characters drawn from the "Base32 Alphabet" (as specified in
65      * Table 3 of RFC 2045) into their 5-bit positive integer equivalents. Characters that are not in the Base32
66      * alphabet but fall within the bounds of the array are translated to -1.
67      *
68      */
69     private static final byte[] DECODE_TABLE = {
70          //  0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
71             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 00-0f
72             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 10-1f
73             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 63, // 20-2f
74             -1, -1, 26, 27, 28, 29, 30, 31, -1, -1, -1, -1, -1, -1, -1, -1, // 30-3f 2-7
75             -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, // 40-4f A-N
76             15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,                     // 50-5a O-Z
77     };
78 
79     /**
80      * This array is a lookup table that translates 5-bit positive integer index values into their "Base32 Alphabet"
81      * equivalents as specified in Table 3 of RFC 2045.
82      */
83     private static final byte[] ENCODE_TABLE = {
84             'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
85             'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
86             '2', '3', '4', '5', '6', '7',
87     };
88 
89     /**
90      * This array is a lookup table that translates Unicode characters drawn from the "Base32 |Hex Alphabet" (as specified in
91      * Table 3 of RFC 2045) into their 5-bit positive integer equivalents. Characters that are not in the Base32 Hex
92      * alphabet but fall within the bounds of the array are translated to -1.
93      *
94      */
95     private static final byte[] HEX_DECODE_TABLE = {
96          //  0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
97             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 00-0f
98             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 10-1f
99             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 63, // 20-2f
100              0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -1, -1, -1, -1, -1, // 30-3f 2-7
101             -1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, // 40-4f A-N
102             25, 26, 27, 28, 29, 30, 31, 32,                                 // 50-57 O-V
103     };
104 
105     /**
106      * This array is a lookup table that translates 5-bit positive integer index values into their "Base32 Hex Alphabet"
107      * equivalents as specified in Table 3 of RFC 2045.
108      */
109     private static final byte[] HEX_ENCODE_TABLE = {
110             '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
111             'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
112             'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
113     };
114 
115     /** Mask used to extract 5 bits, used when encoding Base32 bytes */
116     private static final int MASK_5BITS = 0x1f;
117 
118     // The static final fields above are used for the original static byte[] methods on Base32.
119     // The private member fields below are used with the new streaming approach, which requires
120     // some state be preserved between calls of encode() and decode().
121 
122     /**
123      * Place holder for the bytes we're dealing with for our based logic.
124      * Bitwise operations store and extract the encoding or decoding from this variable.
125      */
126     private long bitWorkArea;
127 
128     /**
129      * Convenience variable to help us determine when our buffer is going to run out of room and needs resizing.
130      * <code>decodeSize = {@link BYTES_PER_ENCODED_BLOCK} - 1 + lineSeparator.length;</code>
131      */
132     private final int decodeSize;
133 
134     /**
135      * Decode table to use.
136      */
137     private final byte[] decodeTable;
138 
139     /**
140      * Convenience variable to help us determine when our buffer is going to run out of room and needs resizing.
141      * <code>encodeSize = {@link BYTES_PER_ENCODED_BLOCK} + lineSeparator.length;</code>
142      */
143     private final int encodeSize;
144 
145     /**
146      * Encode table to use.
147      */
148     private final byte[] encodeTable;
149 
150     /**
151      * Line separator for encoding. Not used when decoding. Only used if lineLength > 0.
152      */
153     private final byte[] lineSeparator;
154 
155     /**
156      * Creates a Base32 codec used for decoding and encoding.
157      * <p>
158      * When encoding the line length is 0 (no chunking).
159      * </p>
160      *
161      */
Base32()162     public Base32() {
163         this(false);
164     }
165 
166     /**
167      * Creates a Base32 codec used for decoding and encoding.
168      * <p>
169      * When encoding the line length is 0 (no chunking).
170      * </p>
171      * @param useHex if <code>true</code> then use Base32 Hex alphabet
172      */
Base32(boolean useHex)173     public Base32(boolean useHex) {
174         this(0, null, useHex);
175     }
176 
177     /**
178      * Creates a Base32 codec used for decoding and encoding.
179      * <p>
180      * When encoding the line length is given in the constructor, the line separator is CRLF.
181      * </p>
182      *
183      * @param lineLength
184      *            Each line of encoded data will be at most of the given length (rounded down to nearest multiple of 8).
185      *            If lineLength <= 0, then the output will not be divided into lines (chunks). Ignored when decoding.
186      */
Base32(int lineLength)187     public Base32(int lineLength) {
188         this(lineLength, CHUNK_SEPARATOR);
189     }
190 
191     /**
192      * Creates a Base32 codec used for decoding and encoding.
193      * <p>
194      * When encoding the line length and line separator are given in the constructor.
195      * </p>
196      * <p>
197      * Line lengths that aren't multiples of 8 will still essentially end up being multiples of 8 in the encoded data.
198      * </p>
199      *
200      * @param lineLength
201      *            Each line of encoded data will be at most of the given length (rounded down to nearest multiple of 8).
202      *            If lineLength <= 0, then the output will not be divided into lines (chunks). Ignored when decoding.
203      * @param lineSeparator
204      *            Each line of encoded data will end with this sequence of bytes.
205      * @throws IllegalArgumentException
206      *             The provided lineSeparator included some Base32 characters. That's not going to work!
207      */
Base32(int lineLength, byte[] lineSeparator)208     public Base32(int lineLength, byte[] lineSeparator) {
209         this(lineLength, lineSeparator, false);
210     }
211 
212     /**
213      * Creates a Base32 / Base32 Hex codec used for decoding and encoding.
214      * <p>
215      * When encoding the line length and line separator are given in the constructor.
216      * </p>
217      * <p>
218      * Line lengths that aren't multiples of 8 will still essentially end up being multiples of 8 in the encoded data.
219      * </p>
220      *
221      * @param lineLength
222      *            Each line of encoded data will be at most of the given length (rounded down to nearest multiple of 8).
223      *            If lineLength <= 0, then the output will not be divided into lines (chunks). Ignored when decoding.
224      * @param lineSeparator
225      *            Each line of encoded data will end with this sequence of bytes.
226      * @param useHex if <code>true</code>, then use Base32 Hex alphabet, otherwise use Base32 alphabet
227      * @throws IllegalArgumentException
228      *             The provided lineSeparator included some Base32 characters. That's not going to work!
229      *             Or the lineLength > 0 and lineSeparator is null.
230      */
Base32(int lineLength, byte[] lineSeparator, boolean useHex)231     public Base32(int lineLength, byte[] lineSeparator, boolean useHex) {
232         super(BYTES_PER_UNENCODED_BLOCK, BYTES_PER_ENCODED_BLOCK,
233                 lineLength,
234                 lineSeparator == null ? 0 : lineSeparator.length);
235         if (useHex){
236             this.encodeTable = HEX_ENCODE_TABLE;
237             this.decodeTable = HEX_DECODE_TABLE;
238         } else {
239             this.encodeTable = ENCODE_TABLE;
240             this.decodeTable = DECODE_TABLE;
241         }
242         if (lineLength > 0) {
243             if (lineSeparator == null) {
244                 throw new IllegalArgumentException("lineLength "+lineLength+" > 0, but lineSeparator is null");
245             }
246             // Must be done after initializing the tables
247             if (containsAlphabetOrPad(lineSeparator)) {
248                 String sep = StringUtils.newStringUtf8(lineSeparator);
249                 throw new IllegalArgumentException("lineSeparator must not contain Base32 characters: [" + sep + "]");
250             }
251             this.encodeSize = BYTES_PER_ENCODED_BLOCK + lineSeparator.length;
252             this.lineSeparator = new byte[lineSeparator.length];
253             System.arraycopy(lineSeparator, 0, this.lineSeparator, 0, lineSeparator.length);
254         } else {
255             this.encodeSize = BYTES_PER_ENCODED_BLOCK;
256             this.lineSeparator = null;
257         }
258         this.decodeSize = this.encodeSize - 1;
259     }
260 
261     /**
262      * <p>
263      * Decodes all of the provided data, starting at inPos, for inAvail bytes. Should be called at least twice: once
264      * with the data to decode, and once with inAvail set to "-1" to alert decoder that EOF has been reached. The "-1"
265      * call is not necessary when decoding, but it doesn't hurt, either.
266      * </p>
267      * <p>
268      * Ignores all non-Base32 characters. This is how chunked (e.g. 76 character) data is handled, since CR and LF are
269      * silently ignored, but has implications for other bytes, too. This method subscribes to the garbage-in,
270      * garbage-out philosophy: it will not check the provided data for validity.
271      * </p>
272      *
273      * @param in
274      *            byte[] array of ascii data to Base32 decode.
275      * @param inPos
276      *            Position to start reading data from.
277      * @param inAvail
278      *            Amount of bytes available from input for encoding.
279      *
280      * Output is written to {@link #buffer} as 8-bit octets, using {@link pos} as the buffer position
281      */
decode(byte[] in, int inPos, int inAvail)282     void decode(byte[] in, int inPos, int inAvail) { // package protected for access from I/O streams
283         if (eof) {
284             return;
285         }
286         if (inAvail < 0) {
287             eof = true;
288         }
289         for (int i = 0; i < inAvail; i++) {
290             byte b = in[inPos++];
291             if (b == PAD) {
292                 // We're done.
293                 eof = true;
294                 break;
295             } else {
296                 ensureBufferSize(decodeSize);
297                 if (b >= 0 && b < this.decodeTable.length) {
298                     int result = this.decodeTable[b];
299                     if (result >= 0) {
300                         modulus = (modulus+1) % BYTES_PER_ENCODED_BLOCK;
301                         bitWorkArea = (bitWorkArea << BITS_PER_ENCODED_BYTE) + result; // collect decoded bytes
302                         if (modulus == 0) { // we can output the 5 bytes
303                             buffer[pos++] = (byte) ((bitWorkArea >> 32) & MASK_8BITS);
304                             buffer[pos++] = (byte) ((bitWorkArea >> 24) & MASK_8BITS);
305                             buffer[pos++] = (byte) ((bitWorkArea >> 16) & MASK_8BITS);
306                             buffer[pos++] = (byte) ((bitWorkArea >> 8) & MASK_8BITS);
307                             buffer[pos++] = (byte) (bitWorkArea & MASK_8BITS);
308                         }
309                     }
310                 }
311             }
312         }
313 
314         // Two forms of EOF as far as Base32 decoder is concerned: actual
315         // EOF (-1) and first time '=' character is encountered in stream.
316         // This approach makes the '=' padding characters completely optional.
317         if (eof && modulus >= 2) { // if modulus < 2, nothing to do
318             ensureBufferSize(decodeSize);
319 
320             //  we ignore partial bytes, i.e. only multiples of 8 count
321             switch (modulus) {
322                 case 2 : // 10 bits, drop 2 and output one byte
323                     buffer[pos++] = (byte) ((bitWorkArea >> 2) & MASK_8BITS);
324                     break;
325                 case 3 : // 15 bits, drop 7 and output 1 byte
326                     buffer[pos++] = (byte) ((bitWorkArea >> 7) & MASK_8BITS);
327                     break;
328                 case 4 : // 20 bits = 2*8 + 4
329                     bitWorkArea = bitWorkArea >> 4; // drop 4 bits
330                     buffer[pos++] = (byte) ((bitWorkArea >> 8) & MASK_8BITS);
331                     buffer[pos++] = (byte) ((bitWorkArea) & MASK_8BITS);
332                     break;
333                 case 5 : // 25bits = 3*8 + 1
334                     bitWorkArea = bitWorkArea >> 1;
335                     buffer[pos++] = (byte) ((bitWorkArea >> 16) & MASK_8BITS);
336                     buffer[pos++] = (byte) ((bitWorkArea >> 8) & MASK_8BITS);
337                     buffer[pos++] = (byte) ((bitWorkArea) & MASK_8BITS);
338                     break;
339                 case 6 : // 30bits = 3*8 + 6
340                     bitWorkArea = bitWorkArea >> 6;
341                     buffer[pos++] = (byte) ((bitWorkArea >> 16) & MASK_8BITS);
342                     buffer[pos++] = (byte) ((bitWorkArea >> 8) & MASK_8BITS);
343                     buffer[pos++] = (byte) ((bitWorkArea) & MASK_8BITS);
344                     break;
345                 case 7 : // 35 = 4*8 +3
346                     bitWorkArea = bitWorkArea >> 3;
347                     buffer[pos++] = (byte) ((bitWorkArea >> 24) & MASK_8BITS);
348                     buffer[pos++] = (byte) ((bitWorkArea >> 16) & MASK_8BITS);
349                     buffer[pos++] = (byte) ((bitWorkArea >> 8) & MASK_8BITS);
350                     buffer[pos++] = (byte) ((bitWorkArea) & MASK_8BITS);
351                     break;
352             }
353         }
354     }
355 
356     /**
357      * <p>
358      * Encodes all of the provided data, starting at inPos, for inAvail bytes. Must be called at least twice: once with
359      * the data to encode, and once with inAvail set to "-1" to alert encoder that EOF has been reached, so flush last
360      * remaining bytes (if not multiple of 5).
361      * </p>
362      *
363      * @param in
364      *            byte[] array of binary data to Base32 encode.
365      * @param inPos
366      *            Position to start reading data from.
367      * @param inAvail
368      *            Amount of bytes available from input for encoding.
369      */
encode(byte[] in, int inPos, int inAvail)370     void encode(byte[] in, int inPos, int inAvail) { // package protected for access from I/O streams
371         if (eof) {
372             return;
373         }
374         // inAvail < 0 is how we're informed of EOF in the underlying data we're
375         // encoding.
376         if (inAvail < 0) {
377             eof = true;
378             if (0 == modulus && lineLength == 0) {
379                 return; // no leftovers to process and not using chunking
380             }
381             ensureBufferSize(encodeSize);
382             int savedPos = pos;
383             switch (modulus) { // % 5
384                 case 1 : // Only 1 octet; take top 5 bits then remainder
385                     buffer[pos++] = encodeTable[(int)(bitWorkArea >> 3) & MASK_5BITS]; // 8-1*5 = 3
386                     buffer[pos++] = encodeTable[(int)(bitWorkArea << 2) & MASK_5BITS]; // 5-3=2
387                     buffer[pos++] = PAD;
388                     buffer[pos++] = PAD;
389                     buffer[pos++] = PAD;
390                     buffer[pos++] = PAD;
391                     buffer[pos++] = PAD;
392                     buffer[pos++] = PAD;
393                     break;
394 
395                 case 2 : // 2 octets = 16 bits to use
396                     buffer[pos++] = encodeTable[(int)(bitWorkArea >> 11) & MASK_5BITS]; // 16-1*5 = 11
397                     buffer[pos++] = encodeTable[(int)(bitWorkArea >>  6) & MASK_5BITS]; // 16-2*5 = 6
398                     buffer[pos++] = encodeTable[(int)(bitWorkArea >>  1) & MASK_5BITS]; // 16-3*5 = 1
399                     buffer[pos++] = encodeTable[(int)(bitWorkArea <<  4) & MASK_5BITS]; // 5-1 = 4
400                     buffer[pos++] = PAD;
401                     buffer[pos++] = PAD;
402                     buffer[pos++] = PAD;
403                     buffer[pos++] = PAD;
404                     break;
405                 case 3 : // 3 octets = 24 bits to use
406                     buffer[pos++] = encodeTable[(int)(bitWorkArea >> 19) & MASK_5BITS]; // 24-1*5 = 19
407                     buffer[pos++] = encodeTable[(int)(bitWorkArea >> 14) & MASK_5BITS]; // 24-2*5 = 14
408                     buffer[pos++] = encodeTable[(int)(bitWorkArea >>  9) & MASK_5BITS]; // 24-3*5 = 9
409                     buffer[pos++] = encodeTable[(int)(bitWorkArea >>  4) & MASK_5BITS]; // 24-4*5 = 4
410                     buffer[pos++] = encodeTable[(int)(bitWorkArea <<  1) & MASK_5BITS]; // 5-4 = 1
411                     buffer[pos++] = PAD;
412                     buffer[pos++] = PAD;
413                     buffer[pos++] = PAD;
414                     break;
415                 case 4 : // 4 octets = 32 bits to use
416                     buffer[pos++] = encodeTable[(int)(bitWorkArea >> 27) & MASK_5BITS]; // 32-1*5 = 27
417                     buffer[pos++] = encodeTable[(int)(bitWorkArea >> 22) & MASK_5BITS]; // 32-2*5 = 22
418                     buffer[pos++] = encodeTable[(int)(bitWorkArea >> 17) & MASK_5BITS]; // 32-3*5 = 17
419                     buffer[pos++] = encodeTable[(int)(bitWorkArea >> 12) & MASK_5BITS]; // 32-4*5 = 12
420                     buffer[pos++] = encodeTable[(int)(bitWorkArea >>  7) & MASK_5BITS]; // 32-5*5 =  7
421                     buffer[pos++] = encodeTable[(int)(bitWorkArea >>  2) & MASK_5BITS]; // 32-6*5 =  2
422                     buffer[pos++] = encodeTable[(int)(bitWorkArea <<  3) & MASK_5BITS]; // 5-2 = 3
423                     buffer[pos++] = PAD;
424                     break;
425             }
426             currentLinePos += pos - savedPos; // keep track of current line position
427             // if currentPos == 0 we are at the start of a line, so don't add CRLF
428             if (lineLength > 0 && currentLinePos > 0){ // add chunk separator if required
429                 System.arraycopy(lineSeparator, 0, buffer, pos, lineSeparator.length);
430                 pos += lineSeparator.length;
431             }
432         } else {
433             for (int i = 0; i < inAvail; i++) {
434                 ensureBufferSize(encodeSize);
435                 modulus = (modulus+1) % BYTES_PER_UNENCODED_BLOCK;
436                 int b = in[inPos++];
437                 if (b < 0) {
438                     b += 256;
439                 }
440                 bitWorkArea = (bitWorkArea << 8) + b; // BITS_PER_BYTE
441                 if (0 == modulus) { // we have enough bytes to create our output
442                     buffer[pos++] = encodeTable[(int)(bitWorkArea >> 35) & MASK_5BITS];
443                     buffer[pos++] = encodeTable[(int)(bitWorkArea >> 30) & MASK_5BITS];
444                     buffer[pos++] = encodeTable[(int)(bitWorkArea >> 25) & MASK_5BITS];
445                     buffer[pos++] = encodeTable[(int)(bitWorkArea >> 20) & MASK_5BITS];
446                     buffer[pos++] = encodeTable[(int)(bitWorkArea >> 15) & MASK_5BITS];
447                     buffer[pos++] = encodeTable[(int)(bitWorkArea >> 10) & MASK_5BITS];
448                     buffer[pos++] = encodeTable[(int)(bitWorkArea >> 5) & MASK_5BITS];
449                     buffer[pos++] = encodeTable[(int)bitWorkArea & MASK_5BITS];
450                     currentLinePos += BYTES_PER_ENCODED_BLOCK;
451                     if (lineLength > 0 && lineLength <= currentLinePos) {
452                         System.arraycopy(lineSeparator, 0, buffer, pos, lineSeparator.length);
453                         pos += lineSeparator.length;
454                         currentLinePos = 0;
455                     }
456                 }
457             }
458         }
459     }
460 
461     /**
462      * Returns whether or not the <code>octet</code> is in the Base32 alphabet.
463      *
464      * @param octet
465      *            The value to test
466      * @return <code>true</code> if the value is defined in the the Base32 alphabet <code>false</code> otherwise.
467      */
isInAlphabet(byte octet)468     public boolean isInAlphabet(byte octet) {
469         return octet >= 0 && octet < decodeTable.length && decodeTable[octet] != -1;
470     }
471 }
472