1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 2002-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * file name: ucnv_u7.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2002jul01
14 * created by: Markus W. Scherer
15 *
16 * UTF-7 converter implementation. Used to be in ucnv_utf.c.
17 */
18
19 #include "unicode/utypes.h"
20
21 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
22
23 #include "cmemory.h"
24 #include "unicode/ucnv.h"
25 #include "ucnv_bld.h"
26 #include "ucnv_cnv.h"
27 #include "uassert.h"
28
29 /* UTF-7 -------------------------------------------------------------------- */
30
31 /*
32 * UTF-7 is a stateful encoding of Unicode.
33 * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
34 * It was intended for use in Internet email systems, using in its bytewise
35 * encoding only a subset of 7-bit US-ASCII.
36 * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still
37 * occasionally used.
38 *
 * When converting Unicode to UTF-7, the RFC allows some US-ASCII characters
 * to be encoded either directly or in base64. In particular, the characters
 * in set O as defined in the RFC (see below) may be encoded directly, but
 * they are not allowed in, e.g., email headers.
43 * By default, the ICU UTF-7 converter encodes set O directly.
44 * By choosing the option "version=1", set O will be escaped instead.
45 * For example:
46 * utf7Converter=ucnv_open("UTF-7,version=1");
47 *
48 * For details about email headers see RFC 2047.
49 */
50
51 /*
52 * Tests for US-ASCII characters belonging to character classes
53 * defined in UTF-7.
54 *
55 * Set D (directly encoded characters) consists of the following
56 * characters: the upper and lower case letters A through Z
57 * and a through z, the 10 digits 0-9, and the following nine special
58 * characters (note that "+" and "=" are omitted):
59 * '(),-./:?
60 *
61 * Set O (optional direct characters) consists of the following
62 * characters (note that "\" and "~" are omitted):
63 * !"#$%&*;<=>@[]^_`{|}
64 *
65 * According to the rules in RFC 2152, the byte values for the following
66 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
67 * - all C0 control codes except for CR LF TAB
68 * - BACKSLASH
69 * - TILDE
70 * - DEL
71 * - all codes beyond US-ASCII, i.e. all >127
72 */
/*
 * Character-class tests on US-ASCII code values.
 *
 * Each range test subtracts the first code of the range and casts to uint8_t,
 * so values below the range wrap to large numbers and a single unsigned
 * comparison covers both bounds.
 * All of these macros evaluate their argument more than once; do not pass
 * expressions with side effects.
 */

/* nonzero if c is in set D: letters, digits, and '(),-./:? */
#define inSetD(c) \
    ((uint8_t)((c)-97)<26 || (uint8_t)((c)-65)<26 || /* letters */ \
     (uint8_t)((c)-48)<10 ||    /* digits */ \
     (uint8_t)((c)-39)<3 ||     /* '() */ \
     (uint8_t)((c)-44)<4 ||     /* ,-./ */ \
     (c)==58 || (c)==63         /* :? */ \
    )

/* nonzero if c is in set O: !"#$%&*;<=>@[]^_`{|} */
#define inSetO(c) \
    ((uint8_t)((c)-33)<6 ||         /* !"#$%& */ \
     (uint8_t)((c)-59)<4 ||         /* ;<=> */ \
     (uint8_t)((c)-93)<4 ||         /* ]^_` */ \
     (uint8_t)((c)-123)<3 ||        /* {|} */ \
     (c)==42 || (c)==64 || (c)==91  /* *@[ */ \
    )

/* nonzero if c is CR, LF or TAB */
#define isCRLFTAB(c) ((c)==13 || (c)==10 || (c)==9)
/* nonzero if c is SPACE, CR, LF or TAB */
#define isCRLFSPTAB(c) ((c)==32 || (c)==13 || (c)==10 || (c)==9)

/* US-ASCII code values of characters with a special role in UTF-7 */
#define PLUS 43
#define MINUS 45
#define BACKSLASH 92
#define TILDE 126

/* legal byte values: all US-ASCII graphic characters from space to before tilde, and CR LF TAB */
/* (c-32)<94 accepts 32..125, which excludes TILDE (126) and DEL (127); BACKSLASH is excluded explicitly */
#define isLegalUTF7(c) (((uint8_t)((c)-32)<94 && (c)!=BACKSLASH) || isCRLFTAB(c))
99
100 /* encode directly sets D and O and CR LF SP TAB */
101 static const UBool encodeDirectlyMaximum[128]={
102 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
103 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
104 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
105
106 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
107 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
108
109 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
110 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
111
112 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
113 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0
114 };
115
116 /* encode directly set D and CR LF SP TAB but not set O */
117 static const UBool encodeDirectlyRestricted[128]={
118 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
119 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
120 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
121
122 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
123 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
124
125 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
126 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
127
128 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
129 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
130 };
131
/*
 * Maps a 6-bit value (0..63) to the US-ASCII code of its base64 digit.
 * Values are written as numbers, not character literals, so the table
 * does not depend on the compiler's execution character set.
 */
static const uint8_t
toBase64[64]={
    /* A-Z */
    65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
    78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
    /* a-z */
    97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
    110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
    /* 0-9 */
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
    /* +/ */
    43, 47
};
145
/*
 * Maps a US-ASCII code (0..127) to its base64 digit value, or to a
 * negative marker:
 *   0..63  value of a base64 digit
 *   -1     legal character that is not a base64 digit
 *   -2     the minus sign
 *   -3     illegal byte value (most C0 controls, backslash, tilde, DEL)
 */
static const int8_t
fromBase64[128]={
    /* C0 controls, -1 for legal ones (CR LF TAB), -3 for illegal ones */
    -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
    -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,

    /* general punctuation with + and / and a special value (-2) for - */
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
    /* digits */
    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,

    /* A-Z */
    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,

    /* a-z */
    -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
};
165
166 /*
167 * converter status values:
168 *
169 * toUnicodeStatus:
170 * 24 inDirectMode (boolean)
171 * 23..16 base64Counter (-1..7)
172 * 15..0 bits (up to 14 bits incoming base64)
173 *
174 * fromUnicodeStatus:
175 * 31..28 version (0: set O direct 1: set O escaped)
176 * 24 inDirectMode (boolean)
177 * 23..16 base64Counter (0..2)
178 * 7..0 bits (6 bits outgoing base64)
179 *
180 */
181
182 static void
_UTF7Reset(UConverter * cnv,UConverterResetChoice choice)183 _UTF7Reset(UConverter *cnv, UConverterResetChoice choice) {
184 if(choice<=UCNV_RESET_TO_UNICODE) {
185 /* reset toUnicode */
186 cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
187 cnv->toULength=0;
188 }
189 if(choice!=UCNV_RESET_TO_UNICODE) {
190 /* reset fromUnicode */
191 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
192 }
193 }
194
195 static void
_UTF7Open(UConverter * cnv,UConverterLoadArgs * pArgs,UErrorCode * pErrorCode)196 _UTF7Open(UConverter *cnv,
197 UConverterLoadArgs *pArgs,
198 UErrorCode *pErrorCode) {
199 if(UCNV_GET_VERSION(cnv)<=1) {
200 /* TODO(markus): Should just use cnv->options rather than copying the version number. */
201 cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<28;
202 _UTF7Reset(cnv, UCNV_RESET_BOTH);
203 } else {
204 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
205 }
206 }
207
208 static void
_UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)209 _UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
210 UErrorCode *pErrorCode) {
211 UConverter *cnv;
212 const uint8_t *source, *sourceLimit;
213 UChar *target;
214 const UChar *targetLimit;
215 int32_t *offsets;
216
217 uint8_t *bytes;
218 uint8_t byteIndex;
219
220 int32_t length, targetCapacity;
221
222 /* UTF-7 state */
223 uint16_t bits;
224 int8_t base64Counter;
225 UBool inDirectMode;
226
227 int8_t base64Value;
228
229 int32_t sourceIndex, nextSourceIndex;
230
231 uint8_t b;
232 /* set up the local pointers */
233 cnv=pArgs->converter;
234
235 source=(const uint8_t *)pArgs->source;
236 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
237 target=pArgs->target;
238 targetLimit=pArgs->targetLimit;
239 offsets=pArgs->offsets;
240 /* get the state machine state */
241 {
242 uint32_t status=cnv->toUnicodeStatus;
243 inDirectMode=(UBool)((status>>24)&1);
244 base64Counter=(int8_t)(status>>16);
245 bits=(uint16_t)status;
246 }
247 bytes=cnv->toUBytes;
248 byteIndex=cnv->toULength;
249
250 /* sourceIndex=-1 if the current character began in the previous buffer */
251 sourceIndex=byteIndex==0 ? 0 : -1;
252 nextSourceIndex=0;
253
254 if(inDirectMode) {
255 directMode:
256 /*
257 * In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
258 * with their US-ASCII byte values.
259 * Backslash and Tilde and most control characters are not allowed in UTF-7.
260 * A plus sign starts Unicode (or "escape") Mode.
261 *
262 * In Direct Mode, only the sourceIndex is used.
263 */
264 byteIndex=0;
265 length=(int32_t)(sourceLimit-source);
266 targetCapacity=(int32_t)(targetLimit-target);
267 if(length>targetCapacity) {
268 length=targetCapacity;
269 }
270 while(length>0) {
271 b=*source++;
272 if(!isLegalUTF7(b)) {
273 /* illegal */
274 bytes[0]=b;
275 byteIndex=1;
276 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
277 break;
278 } else if(b!=PLUS) {
279 /* write directly encoded character */
280 *target++=b;
281 if(offsets!=NULL) {
282 *offsets++=sourceIndex++;
283 }
284 } else /* PLUS */ {
285 /* switch to Unicode mode */
286 nextSourceIndex=++sourceIndex;
287 inDirectMode=FALSE;
288 byteIndex=0;
289 bits=0;
290 base64Counter=-1;
291 goto unicodeMode;
292 }
293 --length;
294 }
295 if(source<sourceLimit && target>=targetLimit) {
296 /* target is full */
297 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
298 }
299 } else {
300 unicodeMode:
301 /*
302 * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
303 * The base64 sequence ends with any character that is not in the base64 alphabet.
304 * A terminating minus sign is consumed.
305 *
306 * In Unicode Mode, the sourceIndex has the index to the start of the current
307 * base64 bytes, while nextSourceIndex is precisely parallel to source,
308 * keeping the index to the following byte.
309 * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
310 */
311 while(source<sourceLimit) {
312 if(target<targetLimit) {
313 bytes[byteIndex++]=b=*source++;
314 ++nextSourceIndex;
315 base64Value = -3; /* initialize as illegal */
316 if(b>=126 || (base64Value=fromBase64[b])==-3 || base64Value==-1) {
317 /* either
318 * base64Value==-1 for any legal character except base64 and minus sign, or
319 * base64Value==-3 for illegal characters:
320 * 1. In either case, leave Unicode mode.
321 * 2.1. If we ended with an incomplete UChar or none after the +, then
322 * generate an error for the preceding erroneous sequence and deal with
323 * the current (possibly illegal) character next time through.
324 * 2.2. Else the current char comes after a complete UChar, which was already
325 * pushed to the output buf, so:
326 * 2.2.1. If the current char is legal, just save it for processing next time.
327 * It may be for example, a plus which we need to deal with in direct mode.
328 * 2.2.2. Else if the current char is illegal, we might as well deal with it here.
329 */
330 inDirectMode=TRUE;
331 if(base64Counter==-1) {
332 /* illegal: + immediately followed by something other than base64 or minus sign */
333 /* include the plus sign in the reported sequence, but not the subsequent char */
334 --source;
335 bytes[0]=PLUS;
336 byteIndex=1;
337 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
338 break;
339 } else if(bits!=0) {
340 /* bits are illegally left over, a UChar is incomplete */
341 /* don't include current char (legal or illegal) in error seq */
342 --source;
343 --byteIndex;
344 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
345 break;
346 } else {
347 /* previous UChar was complete */
348 if(base64Value==-3) {
349 /* current character is illegal, deal with it here */
350 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
351 break;
352 } else {
353 /* un-read the current character in case it is a plus sign */
354 --source;
355 sourceIndex=nextSourceIndex-1;
356 goto directMode;
357 }
358 }
359 } else if(base64Value>=0) {
360 /* collect base64 bytes into UChars */
361 switch(base64Counter) {
362 case -1: /* -1 is immediately after the + */
363 case 0:
364 bits=base64Value;
365 base64Counter=1;
366 break;
367 case 1:
368 case 3:
369 case 4:
370 case 6:
371 bits=(uint16_t)((bits<<6)|base64Value);
372 ++base64Counter;
373 break;
374 case 2:
375 *target++=(UChar)((bits<<4)|(base64Value>>2));
376 if(offsets!=NULL) {
377 *offsets++=sourceIndex;
378 sourceIndex=nextSourceIndex-1;
379 }
380 bytes[0]=b; /* keep this byte in case an error occurs */
381 byteIndex=1;
382 bits=(uint16_t)(base64Value&3);
383 base64Counter=3;
384 break;
385 case 5:
386 *target++=(UChar)((bits<<2)|(base64Value>>4));
387 if(offsets!=NULL) {
388 *offsets++=sourceIndex;
389 sourceIndex=nextSourceIndex-1;
390 }
391 bytes[0]=b; /* keep this byte in case an error occurs */
392 byteIndex=1;
393 bits=(uint16_t)(base64Value&15);
394 base64Counter=6;
395 break;
396 case 7:
397 *target++=(UChar)((bits<<6)|base64Value);
398 if(offsets!=NULL) {
399 *offsets++=sourceIndex;
400 sourceIndex=nextSourceIndex;
401 }
402 byteIndex=0;
403 bits=0;
404 base64Counter=0;
405 break;
406 default:
407 /* will never occur */
408 break;
409 }
410 } else /*base64Value==-2*/ {
411 /* minus sign terminates the base64 sequence */
412 inDirectMode=TRUE;
413 if(base64Counter==-1) {
414 /* +- i.e. a minus immediately following a plus */
415 *target++=PLUS;
416 if(offsets!=NULL) {
417 *offsets++=sourceIndex-1;
418 }
419 } else {
420 /* absorb the minus and leave the Unicode Mode */
421 if(bits!=0) {
422 /* bits are illegally left over, a UChar is incomplete */
423 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
424 break;
425 }
426 }
427 sourceIndex=nextSourceIndex;
428 goto directMode;
429 }
430 } else {
431 /* target is full */
432 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
433 break;
434 }
435 }
436 }
437
438 if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
439 /*
440 * if we are in Unicode mode, then the byteIndex might not be 0,
441 * but that is ok if bits==0
442 * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
443 * (not true for IMAP-mailbox-name where we must end in direct mode)
444 */
445 byteIndex=0;
446 }
447
448 /* set the converter state back into UConverter */
449 cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
450 cnv->toULength=byteIndex;
451
452 /* write back the updated pointers */
453 pArgs->source=(const char *)source;
454 pArgs->target=target;
455 pArgs->offsets=offsets;
456 return;
457 }
458
459 static void
_UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)460 _UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
461 UErrorCode *pErrorCode) {
462 UConverter *cnv;
463 const UChar *source, *sourceLimit;
464 uint8_t *target, *targetLimit;
465 int32_t *offsets;
466
467 int32_t length, targetCapacity, sourceIndex;
468 UChar c;
469
470 /* UTF-7 state */
471 const UBool *encodeDirectly;
472 uint8_t bits;
473 int8_t base64Counter;
474 UBool inDirectMode;
475
476 /* set up the local pointers */
477 cnv=pArgs->converter;
478
479 /* set up the local pointers */
480 source=pArgs->source;
481 sourceLimit=pArgs->sourceLimit;
482 target=(uint8_t *)pArgs->target;
483 targetLimit=(uint8_t *)pArgs->targetLimit;
484 offsets=pArgs->offsets;
485
486 /* get the state machine state */
487 {
488 uint32_t status=cnv->fromUnicodeStatus;
489 encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted;
490 inDirectMode=(UBool)((status>>24)&1);
491 base64Counter=(int8_t)(status>>16);
492 bits=(uint8_t)status;
493 U_ASSERT(bits<=UPRV_LENGTHOF(toBase64));
494 }
495
496 /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
497 sourceIndex=0;
498
499 if(inDirectMode) {
500 directMode:
501 length=(int32_t)(sourceLimit-source);
502 targetCapacity=(int32_t)(targetLimit-target);
503 if(length>targetCapacity) {
504 length=targetCapacity;
505 }
506 while(length>0) {
507 c=*source++;
508 /* currently always encode CR LF SP TAB directly */
509 if(c<=127 && encodeDirectly[c]) {
510 /* encode directly */
511 *target++=(uint8_t)c;
512 if(offsets!=NULL) {
513 *offsets++=sourceIndex++;
514 }
515 } else if(c==PLUS) {
516 /* output +- for + */
517 *target++=PLUS;
518 if(target<targetLimit) {
519 *target++=MINUS;
520 if(offsets!=NULL) {
521 *offsets++=sourceIndex;
522 *offsets++=sourceIndex++;
523 }
524 /* realign length and targetCapacity */
525 goto directMode;
526 } else {
527 if(offsets!=NULL) {
528 *offsets++=sourceIndex++;
529 }
530 cnv->charErrorBuffer[0]=MINUS;
531 cnv->charErrorBufferLength=1;
532 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
533 break;
534 }
535 } else {
536 /* un-read this character and switch to Unicode Mode */
537 --source;
538 *target++=PLUS;
539 if(offsets!=NULL) {
540 *offsets++=sourceIndex;
541 }
542 inDirectMode=FALSE;
543 base64Counter=0;
544 goto unicodeMode;
545 }
546 --length;
547 }
548 if(source<sourceLimit && target>=targetLimit) {
549 /* target is full */
550 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
551 }
552 } else {
553 unicodeMode:
554 while(source<sourceLimit) {
555 if(target<targetLimit) {
556 c=*source++;
557 if(c<=127 && encodeDirectly[c]) {
558 /* encode directly */
559 inDirectMode=TRUE;
560
561 /* trick: back out this character to make this easier */
562 --source;
563
564 /* terminate the base64 sequence */
565 if(base64Counter!=0) {
566 /* write remaining bits for the previous character */
567 *target++=toBase64[bits];
568 if(offsets!=NULL) {
569 *offsets++=sourceIndex-1;
570 }
571 }
572 if(fromBase64[c]!=-1) {
573 /* need to terminate with a minus */
574 if(target<targetLimit) {
575 *target++=MINUS;
576 if(offsets!=NULL) {
577 *offsets++=sourceIndex-1;
578 }
579 } else {
580 cnv->charErrorBuffer[0]=MINUS;
581 cnv->charErrorBufferLength=1;
582 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
583 break;
584 }
585 }
586 goto directMode;
587 } else {
588 /*
589 * base64 this character:
590 * Output 2 or 3 base64 bytes for the remaining bits of the previous character
591 * and the bits of this character, each implicitly in UTF-16BE.
592 *
593 * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
594 * character to the next. The actual 2 or 4 bits are shifted to the left edge
595 * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
596 */
597 switch(base64Counter) {
598 case 0:
599 *target++=toBase64[c>>10];
600 if(target<targetLimit) {
601 *target++=toBase64[(c>>4)&0x3f];
602 if(offsets!=NULL) {
603 *offsets++=sourceIndex;
604 *offsets++=sourceIndex++;
605 }
606 } else {
607 if(offsets!=NULL) {
608 *offsets++=sourceIndex++;
609 }
610 cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f];
611 cnv->charErrorBufferLength=1;
612 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
613 }
614 bits=(uint8_t)((c&15)<<2);
615 base64Counter=1;
616 break;
617 case 1:
618 *target++=toBase64[bits|(c>>14)];
619 if(target<targetLimit) {
620 *target++=toBase64[(c>>8)&0x3f];
621 if(target<targetLimit) {
622 *target++=toBase64[(c>>2)&0x3f];
623 if(offsets!=NULL) {
624 *offsets++=sourceIndex;
625 *offsets++=sourceIndex;
626 *offsets++=sourceIndex++;
627 }
628 } else {
629 if(offsets!=NULL) {
630 *offsets++=sourceIndex;
631 *offsets++=sourceIndex++;
632 }
633 cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f];
634 cnv->charErrorBufferLength=1;
635 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
636 }
637 } else {
638 if(offsets!=NULL) {
639 *offsets++=sourceIndex++;
640 }
641 cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f];
642 cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f];
643 cnv->charErrorBufferLength=2;
644 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
645 }
646 bits=(uint8_t)((c&3)<<4);
647 base64Counter=2;
648 break;
649 case 2:
650 *target++=toBase64[bits|(c>>12)];
651 if(target<targetLimit) {
652 *target++=toBase64[(c>>6)&0x3f];
653 if(target<targetLimit) {
654 *target++=toBase64[c&0x3f];
655 if(offsets!=NULL) {
656 *offsets++=sourceIndex;
657 *offsets++=sourceIndex;
658 *offsets++=sourceIndex++;
659 }
660 } else {
661 if(offsets!=NULL) {
662 *offsets++=sourceIndex;
663 *offsets++=sourceIndex++;
664 }
665 cnv->charErrorBuffer[0]=toBase64[c&0x3f];
666 cnv->charErrorBufferLength=1;
667 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
668 }
669 } else {
670 if(offsets!=NULL) {
671 *offsets++=sourceIndex++;
672 }
673 cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f];
674 cnv->charErrorBuffer[1]=toBase64[c&0x3f];
675 cnv->charErrorBufferLength=2;
676 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
677 }
678 bits=0;
679 base64Counter=0;
680 break;
681 default:
682 /* will never occur */
683 break;
684 }
685 }
686 } else {
687 /* target is full */
688 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
689 break;
690 }
691 }
692 }
693
694 if(pArgs->flush && source>=sourceLimit) {
695 /* flush remaining bits to the target */
696 if(!inDirectMode) {
697 if (base64Counter!=0) {
698 if(target<targetLimit) {
699 *target++=toBase64[bits];
700 if(offsets!=NULL) {
701 *offsets++=sourceIndex-1;
702 }
703 } else {
704 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
705 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
706 }
707 }
708 /* Add final MINUS to terminate unicodeMode */
709 if(target<targetLimit) {
710 *target++=MINUS;
711 if(offsets!=NULL) {
712 *offsets++=sourceIndex-1;
713 }
714 } else {
715 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
716 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
717 }
718 }
719 /* reset the state for the next conversion */
720 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
721 } else {
722 /* set the converter state back into UConverter */
723 cnv->fromUnicodeStatus=
724 (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/
725 ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
726 }
727
728 /* write back the updated pointers */
729 pArgs->source=source;
730 pArgs->target=(char *)target;
731 pArgs->offsets=offsets;
732 return;
733 }
734
735 static const char *
_UTF7GetName(const UConverter * cnv)736 _UTF7GetName(const UConverter *cnv) {
737 switch(cnv->fromUnicodeStatus>>28) {
738 case 1:
739 return "UTF-7,version=1";
740 default:
741 return "UTF-7";
742 }
743 }
744
745 static const UConverterImpl _UTF7Impl={
746 UCNV_UTF7,
747
748 NULL,
749 NULL,
750
751 _UTF7Open,
752 NULL,
753 _UTF7Reset,
754
755 _UTF7ToUnicodeWithOffsets,
756 _UTF7ToUnicodeWithOffsets,
757 _UTF7FromUnicodeWithOffsets,
758 _UTF7FromUnicodeWithOffsets,
759 NULL,
760
761 NULL,
762 _UTF7GetName,
763 NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
764 NULL,
765 ucnv_getCompleteUnicodeSet,
766
767 NULL,
768 NULL
769 };
770
771 static const UConverterStaticData _UTF7StaticData={
772 sizeof(UConverterStaticData),
773 "UTF-7",
774 0, /* TODO CCSID for UTF-7 */
775 UCNV_IBM, UCNV_UTF7,
776 1, 4,
777 { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
778 FALSE, FALSE,
779 0,
780 0,
781 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
782 };
783
784 const UConverterSharedData _UTF7Data=
785 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF7StaticData, &_UTF7Impl);
786
787 /* IMAP mailbox name encoding ----------------------------------------------- */
788
789 /*
790 * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
791 * http://www.ietf.org/rfc/rfc2060.txt
792 *
793 * 5.1.3. Mailbox International Naming Convention
794 *
795 * By convention, international mailbox names are specified using a
796 * modified version of the UTF-7 encoding described in [UTF-7]. The
797 * purpose of these modifications is to correct the following problems
798 * with UTF-7:
799 *
800 * 1) UTF-7 uses the "+" character for shifting; this conflicts with
801 * the common use of "+" in mailbox names, in particular USENET
802 * newsgroup names.
803 *
804 * 2) UTF-7's encoding is BASE64 which uses the "/" character; this
805 * conflicts with the use of "/" as a popular hierarchy delimiter.
806 *
807 * 3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
808 * the use of "\" as a popular hierarchy delimiter.
809 *
810 * 4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
811 * the use of "~" in some servers as a home directory indicator.
812 *
813 * 5) UTF-7 permits multiple alternate forms to represent the same
814 * string; in particular, printable US-ASCII chararacters can be
815 * represented in encoded form.
816 *
817 * In modified UTF-7, printable US-ASCII characters except for "&"
818 * represent themselves; that is, characters with octet values 0x20-0x25
819 * and 0x27-0x7e. The character "&" (0x26) is represented by the two-
820 * octet sequence "&-".
821 *
822 * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
823 * Unicode 16-bit octets) are represented in modified BASE64, with a
824 * further modification from [UTF-7] that "," is used instead of "/".
825 * Modified BASE64 MUST NOT be used to represent any printing US-ASCII
826 * character which can represent itself.
827 *
828 * "&" is used to shift to modified BASE64 and "-" to shift back to US-
829 * ASCII. All names start in US-ASCII, and MUST end in US-ASCII (that
830 * is, a name that ends with a Unicode 16-bit octet MUST end with a "-
831 * ").
832 *
833 * For example, here is a mailbox name which mixes English, Japanese,
834 * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
835 */
836
837 /*
838 * Tests for US-ASCII characters belonging to character classes
839 * defined in UTF-7.
840 *
841 * Set D (directly encoded characters) consists of the following
842 * characters: the upper and lower case letters A through Z
843 * and a through z, the 10 digits 0-9, and the following nine special
844 * characters (note that "+" and "=" are omitted):
845 * '(),-./:?
846 *
847 * Set O (optional direct characters) consists of the following
848 * characters (note that "\" and "~" are omitted):
849 * !"#$%&*;<=>@[]^_`{|}
850 *
851 * According to the rules in RFC 2152, the byte values for the following
852 * US-ASCII characters are not used in UTF-7 and are therefore illegal:
853 * - all C0 control codes except for CR LF TAB
854 * - BACKSLASH
855 * - TILDE
856 * - DEL
857 * - all codes beyond US-ASCII, i.e. all >127
858 */
859
/* uses '&' not '+' to start a base64 sequence */
#define AMPERSAND 0x26
#define COMMA 0x2c
#define SLASH 0x2f

/*
 * The macros below evaluate their argument more than once; do not pass
 * expressions with side effects.
 */

/* legal byte values: all US-ASCII graphic characters 0x20..0x7e */
#define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)

/* direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26 */
/* the argument is parenthesized so that expression arguments compare as a whole */
#define inSetDIMAP(c) (isLegalIMAP(c) && (c)!=AMPERSAND)

/* modified base64: digit value 63 is written as ',' instead of '/' */
#define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)
/* ',' decodes to 63; '/' is not a digit here and maps to -1; everything else as in fromBase64[] */
#define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
873
874 /*
875 * converter status values:
876 *
877 * toUnicodeStatus:
878 * 24 inDirectMode (boolean)
879 * 23..16 base64Counter (-1..7)
880 * 15..0 bits (up to 14 bits incoming base64)
881 *
882 * fromUnicodeStatus:
883 * 24 inDirectMode (boolean)
884 * 23..16 base64Counter (0..2)
885 * 7..0 bits (6 bits outgoing base64)
886 *
887 * ignore bits 31..25
888 */
889
890 static void
_IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs * pArgs,UErrorCode * pErrorCode)891 _IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
892 UErrorCode *pErrorCode) {
893 UConverter *cnv;
894 const uint8_t *source, *sourceLimit;
895 UChar *target;
896 const UChar *targetLimit;
897 int32_t *offsets;
898
899 uint8_t *bytes;
900 uint8_t byteIndex;
901
902 int32_t length, targetCapacity;
903
904 /* UTF-7 state */
905 uint16_t bits;
906 int8_t base64Counter;
907 UBool inDirectMode;
908
909 int8_t base64Value;
910
911 int32_t sourceIndex, nextSourceIndex;
912
913 UChar c;
914 uint8_t b;
915
916 /* set up the local pointers */
917 cnv=pArgs->converter;
918
919 source=(const uint8_t *)pArgs->source;
920 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
921 target=pArgs->target;
922 targetLimit=pArgs->targetLimit;
923 offsets=pArgs->offsets;
924 /* get the state machine state */
925 {
926 uint32_t status=cnv->toUnicodeStatus;
927 inDirectMode=(UBool)((status>>24)&1);
928 base64Counter=(int8_t)(status>>16);
929 bits=(uint16_t)status;
930 }
931 bytes=cnv->toUBytes;
932 byteIndex=cnv->toULength;
933
934 /* sourceIndex=-1 if the current character began in the previous buffer */
935 sourceIndex=byteIndex==0 ? 0 : -1;
936 nextSourceIndex=0;
937
938 if(inDirectMode) {
939 directMode:
940 /*
941 * In Direct Mode, US-ASCII characters are encoded directly, i.e.,
942 * with their US-ASCII byte values.
943 * An ampersand starts Unicode (or "escape") Mode.
944 *
945 * In Direct Mode, only the sourceIndex is used.
946 */
947 byteIndex=0;
948 length=(int32_t)(sourceLimit-source);
949 targetCapacity=(int32_t)(targetLimit-target);
950 if(length>targetCapacity) {
951 length=targetCapacity;
952 }
953 while(length>0) {
954 b=*source++;
955 if(!isLegalIMAP(b)) {
956 /* illegal */
957 bytes[0]=b;
958 byteIndex=1;
959 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
960 break;
961 } else if(b!=AMPERSAND) {
962 /* write directly encoded character */
963 *target++=b;
964 if(offsets!=NULL) {
965 *offsets++=sourceIndex++;
966 }
967 } else /* AMPERSAND */ {
968 /* switch to Unicode mode */
969 nextSourceIndex=++sourceIndex;
970 inDirectMode=FALSE;
971 byteIndex=0;
972 bits=0;
973 base64Counter=-1;
974 goto unicodeMode;
975 }
976 --length;
977 }
978 if(source<sourceLimit && target>=targetLimit) {
979 /* target is full */
980 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
981 }
982 } else {
983 unicodeMode:
984 /*
985 * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
986 * The base64 sequence ends with any character that is not in the base64 alphabet.
987 * A terminating minus sign is consumed.
988 * US-ASCII must not be base64-ed.
989 *
990 * In Unicode Mode, the sourceIndex has the index to the start of the current
991 * base64 bytes, while nextSourceIndex is precisely parallel to source,
992 * keeping the index to the following byte.
993 * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
994 */
995 while(source<sourceLimit) {
996 if(target<targetLimit) {
997 bytes[byteIndex++]=b=*source++;
998 ++nextSourceIndex;
999 if(b>0x7e) {
1000 /* illegal - test other illegal US-ASCII values by base64Value==-3 */
1001 inDirectMode=TRUE;
1002 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1003 break;
1004 } else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
1005 /* collect base64 bytes into UChars */
1006 switch(base64Counter) {
1007 case -1: /* -1 is immediately after the & */
1008 case 0:
1009 bits=base64Value;
1010 base64Counter=1;
1011 break;
1012 case 1:
1013 case 3:
1014 case 4:
1015 case 6:
1016 bits=(uint16_t)((bits<<6)|base64Value);
1017 ++base64Counter;
1018 break;
1019 case 2:
1020 c=(UChar)((bits<<4)|(base64Value>>2));
1021 if(isLegalIMAP(c)) {
1022 /* illegal */
1023 inDirectMode=TRUE;
1024 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1025 goto endloop;
1026 }
1027 *target++=c;
1028 if(offsets!=NULL) {
1029 *offsets++=sourceIndex;
1030 sourceIndex=nextSourceIndex-1;
1031 }
1032 bytes[0]=b; /* keep this byte in case an error occurs */
1033 byteIndex=1;
1034 bits=(uint16_t)(base64Value&3);
1035 base64Counter=3;
1036 break;
1037 case 5:
1038 c=(UChar)((bits<<2)|(base64Value>>4));
1039 if(isLegalIMAP(c)) {
1040 /* illegal */
1041 inDirectMode=TRUE;
1042 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1043 goto endloop;
1044 }
1045 *target++=c;
1046 if(offsets!=NULL) {
1047 *offsets++=sourceIndex;
1048 sourceIndex=nextSourceIndex-1;
1049 }
1050 bytes[0]=b; /* keep this byte in case an error occurs */
1051 byteIndex=1;
1052 bits=(uint16_t)(base64Value&15);
1053 base64Counter=6;
1054 break;
1055 case 7:
1056 c=(UChar)((bits<<6)|base64Value);
1057 if(isLegalIMAP(c)) {
1058 /* illegal */
1059 inDirectMode=TRUE;
1060 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1061 goto endloop;
1062 }
1063 *target++=c;
1064 if(offsets!=NULL) {
1065 *offsets++=sourceIndex;
1066 sourceIndex=nextSourceIndex;
1067 }
1068 byteIndex=0;
1069 bits=0;
1070 base64Counter=0;
1071 break;
1072 default:
1073 /* will never occur */
1074 break;
1075 }
1076 } else if(base64Value==-2) {
1077 /* minus sign terminates the base64 sequence */
1078 inDirectMode=TRUE;
1079 if(base64Counter==-1) {
1080 /* &- i.e. a minus immediately following an ampersand */
1081 *target++=AMPERSAND;
1082 if(offsets!=NULL) {
1083 *offsets++=sourceIndex-1;
1084 }
1085 } else {
1086 /* absorb the minus and leave the Unicode Mode */
1087 if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
1088 /* bits are illegally left over, a UChar is incomplete */
1089 /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
1090 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1091 break;
1092 }
1093 }
1094 sourceIndex=nextSourceIndex;
1095 goto directMode;
1096 } else {
1097 if(base64Counter==-1) {
1098 /* illegal: & immediately followed by something other than base64 or minus sign */
1099 /* include the ampersand in the reported sequence */
1100 --sourceIndex;
1101 bytes[0]=AMPERSAND;
1102 bytes[1]=b;
1103 byteIndex=2;
1104 }
1105 /* base64Value==-1 for characters that are illegal only in Unicode mode */
1106 /* base64Value==-3 for illegal characters */
1107 /* illegal */
1108 inDirectMode=TRUE;
1109 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1110 break;
1111 }
1112 } else {
1113 /* target is full */
1114 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1115 break;
1116 }
1117 }
1118 }
1119 endloop:
1120
1121 /*
1122 * the end of the input stream and detection of truncated input
1123 * are handled by the framework, but here we must check if we are in Unicode
1124 * mode and byteIndex==0 because we must end in direct mode
1125 *
1126 * conditions:
1127 * successful
1128 * in Unicode mode and byteIndex==0
1129 * end of input and no truncated input
1130 */
1131 if( U_SUCCESS(*pErrorCode) &&
1132 !inDirectMode && byteIndex==0 &&
1133 pArgs->flush && source>=sourceLimit
1134 ) {
1135 if(base64Counter==-1) {
1136 /* & at the very end of the input */
1137 /* make the ampersand the reported sequence */
1138 bytes[0]=AMPERSAND;
1139 byteIndex=1;
1140 }
1141 /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
1142
1143 inDirectMode=TRUE; /* avoid looping */
1144 *pErrorCode=U_TRUNCATED_CHAR_FOUND;
1145 }
1146
1147 /* set the converter state back into UConverter */
1148 cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
1149 cnv->toULength=byteIndex;
1150
1151 /* write back the updated pointers */
1152 pArgs->source=(const char *)source;
1153 pArgs->target=target;
1154 pArgs->offsets=offsets;
1155 return;
1156 }
1157
1158 static void
_IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs * pArgs,UErrorCode * pErrorCode)1159 _IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1160 UErrorCode *pErrorCode) {
1161 UConverter *cnv;
1162 const UChar *source, *sourceLimit;
1163 uint8_t *target, *targetLimit;
1164 int32_t *offsets;
1165
1166 int32_t length, targetCapacity, sourceIndex;
1167 UChar c;
1168 uint8_t b;
1169
1170 /* UTF-7 state */
1171 uint8_t bits;
1172 int8_t base64Counter;
1173 UBool inDirectMode;
1174
1175 /* set up the local pointers */
1176 cnv=pArgs->converter;
1177
1178 /* set up the local pointers */
1179 source=pArgs->source;
1180 sourceLimit=pArgs->sourceLimit;
1181 target=(uint8_t *)pArgs->target;
1182 targetLimit=(uint8_t *)pArgs->targetLimit;
1183 offsets=pArgs->offsets;
1184
1185 /* get the state machine state */
1186 {
1187 uint32_t status=cnv->fromUnicodeStatus;
1188 inDirectMode=(UBool)((status>>24)&1);
1189 base64Counter=(int8_t)(status>>16);
1190 bits=(uint8_t)status;
1191 }
1192
1193 /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
1194 sourceIndex=0;
1195
1196 if(inDirectMode) {
1197 directMode:
1198 length=(int32_t)(sourceLimit-source);
1199 targetCapacity=(int32_t)(targetLimit-target);
1200 if(length>targetCapacity) {
1201 length=targetCapacity;
1202 }
1203 while(length>0) {
1204 c=*source++;
1205 /* encode 0x20..0x7e except '&' directly */
1206 if(inSetDIMAP(c)) {
1207 /* encode directly */
1208 *target++=(uint8_t)c;
1209 if(offsets!=NULL) {
1210 *offsets++=sourceIndex++;
1211 }
1212 } else if(c==AMPERSAND) {
1213 /* output &- for & */
1214 *target++=AMPERSAND;
1215 if(target<targetLimit) {
1216 *target++=MINUS;
1217 if(offsets!=NULL) {
1218 *offsets++=sourceIndex;
1219 *offsets++=sourceIndex++;
1220 }
1221 /* realign length and targetCapacity */
1222 goto directMode;
1223 } else {
1224 if(offsets!=NULL) {
1225 *offsets++=sourceIndex++;
1226 }
1227 cnv->charErrorBuffer[0]=MINUS;
1228 cnv->charErrorBufferLength=1;
1229 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1230 break;
1231 }
1232 } else {
1233 /* un-read this character and switch to Unicode Mode */
1234 --source;
1235 *target++=AMPERSAND;
1236 if(offsets!=NULL) {
1237 *offsets++=sourceIndex;
1238 }
1239 inDirectMode=FALSE;
1240 base64Counter=0;
1241 goto unicodeMode;
1242 }
1243 --length;
1244 }
1245 if(source<sourceLimit && target>=targetLimit) {
1246 /* target is full */
1247 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1248 }
1249 } else {
1250 unicodeMode:
1251 while(source<sourceLimit) {
1252 if(target<targetLimit) {
1253 c=*source++;
1254 if(isLegalIMAP(c)) {
1255 /* encode directly */
1256 inDirectMode=TRUE;
1257
1258 /* trick: back out this character to make this easier */
1259 --source;
1260
1261 /* terminate the base64 sequence */
1262 if(base64Counter!=0) {
1263 /* write remaining bits for the previous character */
1264 *target++=TO_BASE64_IMAP(bits);
1265 if(offsets!=NULL) {
1266 *offsets++=sourceIndex-1;
1267 }
1268 }
1269 /* need to terminate with a minus */
1270 if(target<targetLimit) {
1271 *target++=MINUS;
1272 if(offsets!=NULL) {
1273 *offsets++=sourceIndex-1;
1274 }
1275 } else {
1276 cnv->charErrorBuffer[0]=MINUS;
1277 cnv->charErrorBufferLength=1;
1278 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1279 break;
1280 }
1281 goto directMode;
1282 } else {
1283 /*
1284 * base64 this character:
1285 * Output 2 or 3 base64 bytes for the remaining bits of the previous character
1286 * and the bits of this character, each implicitly in UTF-16BE.
1287 *
1288 * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
1289 * character to the next. The actual 2 or 4 bits are shifted to the left edge
1290 * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
1291 */
1292 switch(base64Counter) {
1293 case 0:
1294 b=(uint8_t)(c>>10);
1295 *target++=TO_BASE64_IMAP(b);
1296 if(target<targetLimit) {
1297 b=(uint8_t)((c>>4)&0x3f);
1298 *target++=TO_BASE64_IMAP(b);
1299 if(offsets!=NULL) {
1300 *offsets++=sourceIndex;
1301 *offsets++=sourceIndex++;
1302 }
1303 } else {
1304 if(offsets!=NULL) {
1305 *offsets++=sourceIndex++;
1306 }
1307 b=(uint8_t)((c>>4)&0x3f);
1308 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1309 cnv->charErrorBufferLength=1;
1310 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1311 }
1312 bits=(uint8_t)((c&15)<<2);
1313 base64Counter=1;
1314 break;
1315 case 1:
1316 b=(uint8_t)(bits|(c>>14));
1317 *target++=TO_BASE64_IMAP(b);
1318 if(target<targetLimit) {
1319 b=(uint8_t)((c>>8)&0x3f);
1320 *target++=TO_BASE64_IMAP(b);
1321 if(target<targetLimit) {
1322 b=(uint8_t)((c>>2)&0x3f);
1323 *target++=TO_BASE64_IMAP(b);
1324 if(offsets!=NULL) {
1325 *offsets++=sourceIndex;
1326 *offsets++=sourceIndex;
1327 *offsets++=sourceIndex++;
1328 }
1329 } else {
1330 if(offsets!=NULL) {
1331 *offsets++=sourceIndex;
1332 *offsets++=sourceIndex++;
1333 }
1334 b=(uint8_t)((c>>2)&0x3f);
1335 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1336 cnv->charErrorBufferLength=1;
1337 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1338 }
1339 } else {
1340 if(offsets!=NULL) {
1341 *offsets++=sourceIndex++;
1342 }
1343 b=(uint8_t)((c>>8)&0x3f);
1344 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1345 b=(uint8_t)((c>>2)&0x3f);
1346 cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1347 cnv->charErrorBufferLength=2;
1348 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1349 }
1350 bits=(uint8_t)((c&3)<<4);
1351 base64Counter=2;
1352 break;
1353 case 2:
1354 b=(uint8_t)(bits|(c>>12));
1355 *target++=TO_BASE64_IMAP(b);
1356 if(target<targetLimit) {
1357 b=(uint8_t)((c>>6)&0x3f);
1358 *target++=TO_BASE64_IMAP(b);
1359 if(target<targetLimit) {
1360 b=(uint8_t)(c&0x3f);
1361 *target++=TO_BASE64_IMAP(b);
1362 if(offsets!=NULL) {
1363 *offsets++=sourceIndex;
1364 *offsets++=sourceIndex;
1365 *offsets++=sourceIndex++;
1366 }
1367 } else {
1368 if(offsets!=NULL) {
1369 *offsets++=sourceIndex;
1370 *offsets++=sourceIndex++;
1371 }
1372 b=(uint8_t)(c&0x3f);
1373 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1374 cnv->charErrorBufferLength=1;
1375 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1376 }
1377 } else {
1378 if(offsets!=NULL) {
1379 *offsets++=sourceIndex++;
1380 }
1381 b=(uint8_t)((c>>6)&0x3f);
1382 cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1383 b=(uint8_t)(c&0x3f);
1384 cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1385 cnv->charErrorBufferLength=2;
1386 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1387 }
1388 bits=0;
1389 base64Counter=0;
1390 break;
1391 default:
1392 /* will never occur */
1393 break;
1394 }
1395 }
1396 } else {
1397 /* target is full */
1398 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1399 break;
1400 }
1401 }
1402 }
1403
1404 if(pArgs->flush && source>=sourceLimit) {
1405 /* flush remaining bits to the target */
1406 if(!inDirectMode) {
1407 if(base64Counter!=0) {
1408 if(target<targetLimit) {
1409 *target++=TO_BASE64_IMAP(bits);
1410 if(offsets!=NULL) {
1411 *offsets++=sourceIndex-1;
1412 }
1413 } else {
1414 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits);
1415 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1416 }
1417 }
1418 /* need to terminate with a minus */
1419 if(target<targetLimit) {
1420 *target++=MINUS;
1421 if(offsets!=NULL) {
1422 *offsets++=sourceIndex-1;
1423 }
1424 } else {
1425 cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
1426 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1427 }
1428 }
1429 /* reset the state for the next conversion */
1430 cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
1431 } else {
1432 /* set the converter state back into UConverter */
1433 cnv->fromUnicodeStatus=
1434 (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/
1435 ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
1436 }
1437
1438 /* write back the updated pointers */
1439 pArgs->source=source;
1440 pArgs->target=(char *)target;
1441 pArgs->offsets=offsets;
1442 return;
1443 }
1444
1445 static const UConverterImpl _IMAPImpl={
1446 UCNV_IMAP_MAILBOX,
1447
1448 NULL,
1449 NULL,
1450
1451 _UTF7Open,
1452 NULL,
1453 _UTF7Reset,
1454
1455 _IMAPToUnicodeWithOffsets,
1456 _IMAPToUnicodeWithOffsets,
1457 _IMAPFromUnicodeWithOffsets,
1458 _IMAPFromUnicodeWithOffsets,
1459 NULL,
1460
1461 NULL,
1462 NULL,
1463 NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
1464 NULL,
1465 ucnv_getCompleteUnicodeSet
1466 };
1467
1468 static const UConverterStaticData _IMAPStaticData={
1469 sizeof(UConverterStaticData),
1470 "IMAP-mailbox-name",
1471 0, /* TODO CCSID for IMAP-mailbox-name */
1472 UCNV_IBM, UCNV_IMAP_MAILBOX,
1473 1, 4,
1474 { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
1475 FALSE, FALSE,
1476 0,
1477 0,
1478 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1479 };
1480
1481 const UConverterSharedData _IMAPData=
1482 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_IMAPStaticData, &_IMAPImpl);
1483
1484 #endif
1485