1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  *****************************************************************************
5  *
6  *   Copyright (C) 1998-2016, International Business Machines
7  *   Corporation and others.  All Rights Reserved.
8  *
9  *****************************************************************************
10  *
11  *  ucnv_err.c
12  *  Implements error behaviour functions called by T_UConverter_{from,to}Unicode
13  *
14  *
15 *   Change history:
16 *
17 *   06/29/2000  helena      Major rewrite of the callback APIs.
18 */
19 
20 #include "unicode/utypes.h"
21 
22 #if !UCONFIG_NO_CONVERSION
23 
24 #include "unicode/ucnv_err.h"
25 #include "unicode/ucnv_cb.h"
26 #include "ucnv_cnv.h"
27 #include "cmemory.h"
28 #include "unicode/ucnv.h"
29 #include "ustrfmt.h"
30 
/*
 * Capacity, in UChars, of the scratch buffers the escape callbacks format into.
 * 48 = 6 (UChars emitted per escaped byte by the longest to-Unicode forms,
 *         i.e. '&' '#' + up to 3 decimal digits + ';', or '&' '#' 'x' + 2 hex digits + ';')
 *    * 8 (max number of bytes per char for any converter, per the original sizing note).
 */
enum {
    VALUE_STRING_LENGTH = 48
};

/* Code points of the ASCII characters used to build the escape sequences. */
enum {
    UNICODE_SPACE_CODEPOINT        = 0x0020,  /* ' ' */
    UNICODE_HASH_CODEPOINT         = 0x0023,  /* '#' */
    UNICODE_PERCENT_SIGN_CODEPOINT = 0x0025,  /* '%' */
    UNICODE_AMP_CODEPOINT          = 0x0026,  /* '&' */
    UNICODE_PLUS_CODEPOINT         = 0x002B,  /* '+' */
    UNICODE_SEMICOLON_CODEPOINT    = 0x003B,  /* ';' */
    UNICODE_U_CODEPOINT            = 0x0055,  /* 'U' */
    UNICODE_X_CODEPOINT            = 0x0058,  /* 'X' */
    UNICODE_RS_CODEPOINT           = 0x005C,  /* reverse solidus (backslash) */
    UNICODE_U_LOW_CODEPOINT        = 0x0075,  /* 'u' */
    UNICODE_X_LOW_CODEPOINT        = 0x0078,  /* 'x' */
    UNICODE_LEFT_CURLY_CODEPOINT   = 0x007B,  /* '{' */
    UNICODE_RIGHT_CURLY_CODEPOINT  = 0x007D   /* '}' */
};

/*
 * Option selectors: the callbacks below compare these against the first char
 * that a non-NULL callback context points to.
 */
enum {
    UCNV_PRV_ESCAPE_ICU      = 0,    /* the %U / %X form, also used when context is NULL */
    UCNV_PRV_ESCAPE_C        = 'C',
    UCNV_PRV_ESCAPE_XML_DEC  = 'D',
    UCNV_PRV_ESCAPE_XML_HEX  = 'X',
    UCNV_PRV_ESCAPE_JAVA     = 'J',
    UCNV_PRV_ESCAPE_UNICODE  = 'U',
    UCNV_PRV_ESCAPE_CSS2     = 'S',
    UCNV_PRV_STOP_ON_ILLEGAL = 'i'   /* skip/substitute only for UCNV_UNASSIGNED */
};
54 
/*
 * IS_DEFAULT_IGNORABLE_CODE_POINT
 * Checks whether a code point has the default ignorable Unicode property.
 * The list is hard coded here to avoid a dependency on other code, so it must
 * be updated whenever the set of default ignorable code points changes.
 * When an ignorable code point is found and is unmappable, the default
 * callbacks ignore it.
 * For a list of the default ignorable code points, use this link: http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[%3ADI%3A]&g=
 *
 * This list should be kept in sync with the one in CharsetCallback.java
 *
 * Every use of the parameter is parenthesized so that any expression may be
 * passed as the argument. The argument is evaluated more than once, so it must
 * not have side effects.
 */
#define IS_DEFAULT_IGNORABLE_CODE_POINT(c) (\
    ((c) == 0x00AD) || \
    ((c) == 0x034F) || \
    ((c) == 0x061C) || \
    ((c) == 0x115F) || \
    ((c) == 0x1160) || \
    (0x17B4 <= (c) && (c) <= 0x17B5) || \
    (0x180B <= (c) && (c) <= 0x180E) || \
    (0x200B <= (c) && (c) <= 0x200F) || \
    (0x202A <= (c) && (c) <= 0x202E) || \
    ((c) == 0x2060) || \
    (0x2066 <= (c) && (c) <= 0x2069) || \
    (0x2061 <= (c) && (c) <= 0x2064) || \
    (0x206A <= (c) && (c) <= 0x206F) || \
    ((c) == 0x3164) || \
    (0x0FE00 <= (c) && (c) <= 0x0FE0F) || \
    ((c) == 0x0FEFF) || \
    ((c) == 0x0FFA0) || \
    (0x01BCA0 <= (c) && (c) <= 0x01BCA3) || \
    (0x01D173 <= (c) && (c) <= 0x01D17A) || \
    ((c) == 0x0E0001) || \
    (0x0E0020 <= (c) && (c) <= 0x0E007F) || \
    (0x0E0100 <= (c) && (c) <= 0x0E01EF) || \
    ((c) == 0x2065) || \
    (0x0FFF0 <= (c) && (c) <= 0x0FFF8) || \
    ((c) == 0x0E0000) || \
    (0x0E0002 <= (c) && (c) <= 0x0E001F) || \
    (0x0E0080 <= (c) && (c) <= 0x0E00FF) || \
    (0x0E01F0 <= (c) && (c) <= 0x0E0FFF) \
    )
97 
98 
99 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
100 U_CAPI void    U_EXPORT2
UCNV_FROM_U_CALLBACK_STOP(const void * context,UConverterFromUnicodeArgs * fromUArgs,const UChar * codeUnits,int32_t length,UChar32 codePoint,UConverterCallbackReason reason,UErrorCode * err)101 UCNV_FROM_U_CALLBACK_STOP (
102                   const void *context,
103                   UConverterFromUnicodeArgs *fromUArgs,
104                   const UChar* codeUnits,
105                   int32_t length,
106                   UChar32 codePoint,
107                   UConverterCallbackReason reason,
108                   UErrorCode * err)
109 {
110     (void)context;
111     (void)fromUArgs;
112     (void)codeUnits;
113     (void)length;
114     if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
115     {
116         /*
117          * Skip if the codepoint has unicode property of default ignorable.
118          */
119         *err = U_ZERO_ERROR;
120     }
121     /* the caller must have set the error code accordingly */
122     return;
123 }
124 
125 
126 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
127 U_CAPI void    U_EXPORT2
UCNV_TO_U_CALLBACK_STOP(const void * context,UConverterToUnicodeArgs * toUArgs,const char * codePoints,int32_t length,UConverterCallbackReason reason,UErrorCode * err)128 UCNV_TO_U_CALLBACK_STOP (
129                    const void *context,
130                    UConverterToUnicodeArgs *toUArgs,
131                    const char* codePoints,
132                    int32_t length,
133                    UConverterCallbackReason reason,
134                    UErrorCode * err)
135 {
136     /* the caller must have set the error code accordingly */
137     (void)context; (void)toUArgs; (void)codePoints; (void)length; (void)reason; (void)err;
138     return;
139 }
140 
141 U_CAPI void    U_EXPORT2
UCNV_FROM_U_CALLBACK_SKIP(const void * context,UConverterFromUnicodeArgs * fromUArgs,const UChar * codeUnits,int32_t length,UChar32 codePoint,UConverterCallbackReason reason,UErrorCode * err)142 UCNV_FROM_U_CALLBACK_SKIP (
143                   const void *context,
144                   UConverterFromUnicodeArgs *fromUArgs,
145                   const UChar* codeUnits,
146                   int32_t length,
147                   UChar32 codePoint,
148                   UConverterCallbackReason reason,
149                   UErrorCode * err)
150 {
151     (void)fromUArgs;
152     (void)codeUnits;
153     (void)length;
154     if (reason <= UCNV_IRREGULAR)
155     {
156         if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
157         {
158             /*
159              * Skip if the codepoint has unicode property of default ignorable.
160              */
161             *err = U_ZERO_ERROR;
162         }
163         else if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
164         {
165             *err = U_ZERO_ERROR;
166         }
167         /* else the caller must have set the error code accordingly. */
168     }
169     /* else ignore the reset, close and clone calls. */
170 }
171 
172 U_CAPI void    U_EXPORT2
UCNV_FROM_U_CALLBACK_SUBSTITUTE(const void * context,UConverterFromUnicodeArgs * fromArgs,const UChar * codeUnits,int32_t length,UChar32 codePoint,UConverterCallbackReason reason,UErrorCode * err)173 UCNV_FROM_U_CALLBACK_SUBSTITUTE (
174                   const void *context,
175                   UConverterFromUnicodeArgs *fromArgs,
176                   const UChar* codeUnits,
177                   int32_t length,
178                   UChar32 codePoint,
179                   UConverterCallbackReason reason,
180                   UErrorCode * err)
181 {
182     (void)codeUnits;
183     (void)length;
184     if (reason <= UCNV_IRREGULAR)
185     {
186         if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
187         {
188             /*
189              * Skip if the codepoint has unicode property of default ignorable.
190              */
191             *err = U_ZERO_ERROR;
192         }
193         else if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
194         {
195             *err = U_ZERO_ERROR;
196             ucnv_cbFromUWriteSub(fromArgs, 0, err);
197         }
198         /* else the caller must have set the error code accordingly. */
199     }
200     /* else ignore the reset, close and clone calls. */
201 }
202 
203 /*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
204  *uses a clean copy (resetted) of the converter, to convert that unicode
205  *escape sequence to the target codepage (if conversion failure happens then
206  *we revert to substituting with subchar)
207  */
208 U_CAPI void    U_EXPORT2
UCNV_FROM_U_CALLBACK_ESCAPE(const void * context,UConverterFromUnicodeArgs * fromArgs,const UChar * codeUnits,int32_t length,UChar32 codePoint,UConverterCallbackReason reason,UErrorCode * err)209 UCNV_FROM_U_CALLBACK_ESCAPE (
210                          const void *context,
211                          UConverterFromUnicodeArgs *fromArgs,
212                          const UChar *codeUnits,
213                          int32_t length,
214                          UChar32 codePoint,
215                          UConverterCallbackReason reason,
216                          UErrorCode * err)
217 {
218 
219   UChar valueString[VALUE_STRING_LENGTH];
220   int32_t valueStringLength = 0;
221   int32_t i = 0;
222 
223   const UChar *myValueSource = NULL;
224   UErrorCode err2 = U_ZERO_ERROR;
225   UConverterFromUCallback original = NULL;
226   const void *originalContext;
227 
228   UConverterFromUCallback ignoredCallback = NULL;
229   const void *ignoredContext;
230 
231   if (reason > UCNV_IRREGULAR)
232   {
233       return;
234   }
235   else if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
236   {
237       /*
238        * Skip if the codepoint has unicode property of default ignorable.
239        */
240       *err = U_ZERO_ERROR;
241       return;
242   }
243 
244   ucnv_setFromUCallBack (fromArgs->converter,
245                      (UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE,
246                      NULL,
247                      &original,
248                      &originalContext,
249                      &err2);
250 
251   if (U_FAILURE (err2))
252   {
253     *err = err2;
254     return;
255   }
256   if(context==NULL)
257   {
258       while (i < length)
259       {
260         valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT;  /* adding % */
261         valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
262         valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
263       }
264   }
265   else
266   {
267       switch(*((char*)context))
268       {
269       case UCNV_PRV_ESCAPE_JAVA:
270           while (i < length)
271           {
272               valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT;    /* adding \ */
273               valueString[valueStringLength++] = (UChar) UNICODE_U_LOW_CODEPOINT; /* adding u */
274               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
275           }
276           break;
277 
278       case UCNV_PRV_ESCAPE_C:
279           valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT;    /* adding \ */
280 
281           if(length==2){
282               valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
283               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 8);
284 
285           }
286           else{
287               valueString[valueStringLength++] = (UChar) UNICODE_U_LOW_CODEPOINT; /* adding u */
288               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
289           }
290           break;
291 
292       case UCNV_PRV_ESCAPE_XML_DEC:
293 
294           valueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT;   /* adding & */
295           valueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT;  /* adding # */
296           if(length==2){
297               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 10, 0);
298           }
299           else{
300               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 10, 0);
301           }
302           valueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
303           break;
304 
305       case UCNV_PRV_ESCAPE_XML_HEX:
306 
307           valueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT;   /* adding & */
308           valueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT;  /* adding # */
309           valueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */
310           if(length==2){
311               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
312           }
313           else{
314               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 0);
315           }
316           valueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
317           break;
318 
319       case UCNV_PRV_ESCAPE_UNICODE:
320           valueString[valueStringLength++] = (UChar) UNICODE_LEFT_CURLY_CODEPOINT;    /* adding { */
321           valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT;    /* adding U */
322           valueString[valueStringLength++] = (UChar) UNICODE_PLUS_CODEPOINT; /* adding + */
323           if (length == 2) {
324               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 4);
325           } else {
326               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
327           }
328           valueString[valueStringLength++] = (UChar) UNICODE_RIGHT_CURLY_CODEPOINT;    /* adding } */
329           break;
330 
331       case UCNV_PRV_ESCAPE_CSS2:
332           valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT;    /* adding \ */
333           valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
334           /* Always add space character, becase the next character might be whitespace,
335              which would erroneously be considered the termination of the escape sequence. */
336           valueString[valueStringLength++] = (UChar) UNICODE_SPACE_CODEPOINT;
337           break;
338 
339       default:
340           while (i < length)
341           {
342               valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT;  /* adding % */
343               valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT;             /* adding U */
344               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
345           }
346       }
347   }
348   myValueSource = valueString;
349 
350   /* reset the error */
351   *err = U_ZERO_ERROR;
352 
353   ucnv_cbFromUWriteUChars(fromArgs, &myValueSource, myValueSource+valueStringLength, 0, err);
354 
355   ucnv_setFromUCallBack (fromArgs->converter,
356                          original,
357                          originalContext,
358                          &ignoredCallback,
359                          &ignoredContext,
360                          &err2);
361   if (U_FAILURE (err2))
362   {
363       *err = err2;
364       return;
365   }
366 
367   return;
368 }
369 
370 
371 
372 U_CAPI void  U_EXPORT2
UCNV_TO_U_CALLBACK_SKIP(const void * context,UConverterToUnicodeArgs * toArgs,const char * codeUnits,int32_t length,UConverterCallbackReason reason,UErrorCode * err)373 UCNV_TO_U_CALLBACK_SKIP (
374                  const void *context,
375                  UConverterToUnicodeArgs *toArgs,
376                  const char* codeUnits,
377                  int32_t length,
378                  UConverterCallbackReason reason,
379                  UErrorCode * err)
380 {
381     (void)toArgs;
382     (void)codeUnits;
383     (void)length;
384     if (reason <= UCNV_IRREGULAR)
385     {
386         if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
387         {
388             *err = U_ZERO_ERROR;
389         }
390         /* else the caller must have set the error code accordingly. */
391     }
392     /* else ignore the reset, close and clone calls. */
393 }
394 
395 U_CAPI void    U_EXPORT2
UCNV_TO_U_CALLBACK_SUBSTITUTE(const void * context,UConverterToUnicodeArgs * toArgs,const char * codeUnits,int32_t length,UConverterCallbackReason reason,UErrorCode * err)396 UCNV_TO_U_CALLBACK_SUBSTITUTE (
397                  const void *context,
398                  UConverterToUnicodeArgs *toArgs,
399                  const char* codeUnits,
400                  int32_t length,
401                  UConverterCallbackReason reason,
402                  UErrorCode * err)
403 {
404     (void)codeUnits;
405     (void)length;
406     if (reason <= UCNV_IRREGULAR)
407     {
408         if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
409         {
410             *err = U_ZERO_ERROR;
411             ucnv_cbToUWriteSub(toArgs,0,err);
412         }
413         /* else the caller must have set the error code accordingly. */
414     }
415     /* else ignore the reset, close and clone calls. */
416 }
417 
418 /*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
419  *and uses that as the substitution sequence
420  */
421 U_CAPI void   U_EXPORT2
UCNV_TO_U_CALLBACK_ESCAPE(const void * context,UConverterToUnicodeArgs * toArgs,const char * codeUnits,int32_t length,UConverterCallbackReason reason,UErrorCode * err)422 UCNV_TO_U_CALLBACK_ESCAPE (
423                  const void *context,
424                  UConverterToUnicodeArgs *toArgs,
425                  const char* codeUnits,
426                  int32_t length,
427                  UConverterCallbackReason reason,
428                  UErrorCode * err)
429 {
430     UChar uniValueString[VALUE_STRING_LENGTH];
431     int32_t valueStringLength = 0;
432     int32_t i = 0;
433 
434     if (reason > UCNV_IRREGULAR)
435     {
436         return;
437     }
438 
439     if(context==NULL)
440     {
441         while (i < length)
442         {
443             uniValueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
444             uniValueString[valueStringLength++] = (UChar) UNICODE_X_CODEPOINT;    /* adding X */
445             valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
446         }
447     }
448     else
449     {
450         switch(*((char*)context))
451         {
452         case UCNV_PRV_ESCAPE_XML_DEC:
453             while (i < length)
454             {
455                 uniValueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT;   /* adding & */
456                 uniValueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT;  /* adding # */
457                 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 10, 0);
458                 uniValueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
459             }
460             break;
461 
462         case UCNV_PRV_ESCAPE_XML_HEX:
463             while (i < length)
464             {
465                 uniValueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT;   /* adding & */
466                 uniValueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT;  /* adding # */
467                 uniValueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */
468                 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 0);
469                 uniValueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
470             }
471             break;
472         case UCNV_PRV_ESCAPE_C:
473             while (i < length)
474             {
475                 uniValueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT;    /* adding \ */
476                 uniValueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */
477                 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 2);
478             }
479             break;
480         default:
481             while (i < length)
482             {
483                 uniValueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
484                 uniValueString[valueStringLength++] = (UChar) UNICODE_X_CODEPOINT;    /* adding X */
485                 uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
486                 valueStringLength += 2;
487             }
488         }
489     }
490     /* reset the error */
491     *err = U_ZERO_ERROR;
492 
493     ucnv_cbToUWriteUChars(toArgs, uniValueString, valueStringLength, 0, err);
494 }
495 
496 #endif
497