1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2003-2013, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  ucm.c
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2003jun20
16 *   created by: Markus W. Scherer
17 *
18 *   This file reads a .ucm file, stores its mappings and sorts them.
19 *   It implements handling of Unicode conversion mappings from .ucm files
20 *   for makeconv, canonucm, rptp2ucm, etc.
21 *
22 *   Unicode code point sequences with a length of more than 1,
23 *   as well as byte sequences with more than 4 bytes or more than one complete
24 *   character sequence are handled to support m:n mappings.
25 */
26 
27 #include "unicode/utypes.h"
28 #include "unicode/ustring.h"
29 #include "cstring.h"
30 #include "cmemory.h"
31 #include "filestrm.h"
32 #include "uarrsort.h"
33 #include "ucnvmbcs.h"
34 #include "ucnv_bld.h"
35 #include "ucnv_ext.h"
36 #include "uparse.h"
37 #include "ucm.h"
38 #include <stdio.h>
39 
40 #if !UCONFIG_NO_CONVERSION
41 
42 /* -------------------------------------------------------------------------- */
43 
44 static void
printMapping(UCMapping * m,UChar32 * codePoints,uint8_t * bytes,FILE * f)45 printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) {
46     int32_t j;
47 
48     for(j=0; j<m->uLen; ++j) {
49         fprintf(f, "<U%04lX>", (long)codePoints[j]);
50     }
51 
52     fputc(' ', f);
53 
54     for(j=0; j<m->bLen; ++j) {
55         fprintf(f, "\\x%02X", bytes[j]);
56     }
57 
58     if(m->f>=0) {
59         fprintf(f, " |%u\n", m->f);
60     } else {
61         fputs("\n", f);
62     }
63 }
64 
65 U_CAPI void U_EXPORT2
ucm_printMapping(UCMTable * table,UCMapping * m,FILE * f)66 ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) {
67     printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), f);
68 }
69 
70 U_CAPI void U_EXPORT2
ucm_printTable(UCMTable * table,FILE * f,UBool byUnicode)71 ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) {
72     UCMapping *m;
73     int32_t i, length;
74 
75     m=table->mappings;
76     length=table->mappingsLength;
77     if(byUnicode) {
78         for(i=0; i<length; ++m, ++i) {
79             ucm_printMapping(table, m, f);
80         }
81     } else {
82         const int32_t *map=table->reverseMap;
83         for(i=0; i<length; ++i) {
84             ucm_printMapping(table, m+map[i], f);
85         }
86     }
87 }
88 
89 /* mapping comparisons ------------------------------------------------------ */
90 
91 static int32_t
compareUnicode(UCMTable * lTable,const UCMapping * l,UCMTable * rTable,const UCMapping * r)92 compareUnicode(UCMTable *lTable, const UCMapping *l,
93                UCMTable *rTable, const UCMapping *r) {
94     const UChar32 *lu, *ru;
95     int32_t result, i, length;
96 
97     if(l->uLen==1 && r->uLen==1) {
98         /* compare two single code points */
99         return l->u-r->u;
100     }
101 
102     /* get pointers to the code point sequences */
103     lu=UCM_GET_CODE_POINTS(lTable, l);
104     ru=UCM_GET_CODE_POINTS(rTable, r);
105 
106     /* get the minimum length */
107     if(l->uLen<=r->uLen) {
108         length=l->uLen;
109     } else {
110         length=r->uLen;
111     }
112 
113     /* compare the code points */
114     for(i=0; i<length; ++i) {
115         result=lu[i]-ru[i];
116         if(result!=0) {
117             return result;
118         }
119     }
120 
121     /* compare the lengths */
122     return l->uLen-r->uLen;
123 }
124 
125 static int32_t
compareBytes(UCMTable * lTable,const UCMapping * l,UCMTable * rTable,const UCMapping * r,UBool lexical)126 compareBytes(UCMTable *lTable, const UCMapping *l,
127              UCMTable *rTable, const UCMapping *r,
128              UBool lexical) {
129     const uint8_t *lb, *rb;
130     int32_t result, i, length;
131 
132     /*
133      * A lexical comparison is used for sorting in the builder, to allow
134      * an efficient search for a byte sequence that could be a prefix
135      * of a previously entered byte sequence.
136      *
137      * Comparing by lengths first is for compatibility with old .ucm tools
138      * like canonucm and rptp2ucm.
139      */
140     if(lexical) {
141         /* get the minimum length and continue */
142         if(l->bLen<=r->bLen) {
143             length=l->bLen;
144         } else {
145             length=r->bLen;
146         }
147     } else {
148         /* compare lengths first */
149         result=l->bLen-r->bLen;
150         if(result!=0) {
151             return result;
152         } else {
153             length=l->bLen;
154         }
155     }
156 
157     /* get pointers to the byte sequences */
158     lb=UCM_GET_BYTES(lTable, l);
159     rb=UCM_GET_BYTES(rTable, r);
160 
161     /* compare the bytes */
162     for(i=0; i<length; ++i) {
163         result=lb[i]-rb[i];
164         if(result!=0) {
165             return result;
166         }
167     }
168 
169     /* compare the lengths */
170     return l->bLen-r->bLen;
171 }
172 
173 /* compare UCMappings for sorting */
174 static int32_t
compareMappings(UCMTable * lTable,const UCMapping * l,UCMTable * rTable,const UCMapping * r,UBool uFirst)175 compareMappings(UCMTable *lTable, const UCMapping *l,
176                 UCMTable *rTable, const UCMapping *r,
177                 UBool uFirst) {
178     int32_t result;
179 
180     /* choose which side to compare first */
181     if(uFirst) {
182         /* Unicode then bytes */
183         result=compareUnicode(lTable, l, rTable, r);
184         if(result==0) {
185             result=compareBytes(lTable, l, rTable, r, FALSE); /* not lexically, like canonucm */
186         }
187     } else {
188         /* bytes then Unicode */
189         result=compareBytes(lTable, l, rTable, r, TRUE); /* lexically, for builder */
190         if(result==0) {
191             result=compareUnicode(lTable, l, rTable, r);
192         }
193     }
194 
195     if(result!=0) {
196         return result;
197     }
198 
199     /* compare the flags */
200     return l->f-r->f;
201 }
202 U_CDECL_BEGIN
203 /* sorting by Unicode first sorts mappings directly */
204 static int32_t  U_CALLCONV
compareMappingsUnicodeFirst(const void * context,const void * left,const void * right)205 compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) {
206     return compareMappings(
207         (UCMTable *)context, (const UCMapping *)left,
208         (UCMTable *)context, (const UCMapping *)right, TRUE);
209 }
210 
211 /* sorting by bytes first sorts the reverseMap; use indirection to mappings */
212 static int32_t U_CALLCONV
compareMappingsBytesFirst(const void * context,const void * left,const void * right)213 compareMappingsBytesFirst(const void *context, const void *left, const void *right) {
214     UCMTable *table=(UCMTable *)context;
215     int32_t l=*(const int32_t *)left, r=*(const int32_t *)right;
216     return compareMappings(
217         table, table->mappings+l,
218         table, table->mappings+r, FALSE);
219 }
220 U_CDECL_END
221 
222 U_CAPI void U_EXPORT2
ucm_sortTable(UCMTable * t)223 ucm_sortTable(UCMTable *t) {
224     UErrorCode errorCode;
225     int32_t i;
226 
227     if(t->isSorted) {
228         return;
229     }
230 
231     errorCode=U_ZERO_ERROR;
232 
233     /* 1. sort by Unicode first */
234     uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping),
235                    compareMappingsUnicodeFirst, t,
236                    FALSE, &errorCode);
237 
238     /* build the reverseMap */
239     if(t->reverseMap==NULL) {
240         /*
241          * allocate mappingsCapacity instead of mappingsLength so that
242          * if mappings are added, the reverseMap need not be
243          * reallocated each time
244          * (see ucm_moveMappings() and ucm_addMapping())
245          */
246         t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t));
247         if(t->reverseMap==NULL) {
248             fprintf(stderr, "ucm error: unable to allocate reverseMap\n");
249             exit(U_MEMORY_ALLOCATION_ERROR);
250         }
251     }
252     for(i=0; i<t->mappingsLength; ++i) {
253         t->reverseMap[i]=i;
254     }
255 
256     /* 2. sort reverseMap by mappings bytes first */
257     uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t),
258                    compareMappingsBytesFirst, t,
259                    FALSE, &errorCode);
260 
261     if(U_FAILURE(errorCode)) {
262         fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n",
263                 u_errorName(errorCode));
264         exit(errorCode);
265     }
266 
267     t->isSorted=TRUE;
268 }
269 
270 /*
271  * remove mappings with their move flag set from the base table
272  * and move some of them (with UCM_MOVE_TO_EXT) to the extension table
273  */
274 U_CAPI void U_EXPORT2
ucm_moveMappings(UCMTable * base,UCMTable * ext)275 ucm_moveMappings(UCMTable *base, UCMTable *ext) {
276     UCMapping *mb, *mbLimit;
277     int8_t flag;
278 
279     mb=base->mappings;
280     mbLimit=mb+base->mappingsLength;
281 
282     while(mb<mbLimit) {
283         flag=mb->moveFlag;
284         if(flag!=0) {
285             /* reset the move flag */
286             mb->moveFlag=0;
287 
288             if(ext!=NULL && (flag&UCM_MOVE_TO_EXT)) {
289                 /* add the mapping to the extension table */
290                 ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb));
291             }
292 
293             /* remove this mapping: move the last base mapping down and overwrite the current one */
294             if(mb<(mbLimit-1)) {
295                 uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping));
296             }
297             --mbLimit;
298             --base->mappingsLength;
299             base->isSorted=FALSE;
300         } else {
301             ++mb;
302         }
303     }
304 }
305 
/* result bits returned by checkBaseExtUnicode() and checkBaseExtBytes() */
enum {
    /* at least one mapping had a bit set in its moveFlag */
    NEEDS_MOVE=1<<0,
    /* at least one conflict between base and extension was reported */
    HAS_ERRORS=1<<1
};
310 
311 static uint8_t
checkBaseExtUnicode(UCMStates * baseStates,UCMTable * base,UCMTable * ext,UBool moveToExt,UBool intersectBase)312 checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
313                     UBool moveToExt, UBool intersectBase) {
314     (void)baseStates;
315 
316     UCMapping *mb, *me, *mbLimit, *meLimit;
317     int32_t cmp;
318     uint8_t result;
319 
320     mb=base->mappings;
321     mbLimit=mb+base->mappingsLength;
322 
323     me=ext->mappings;
324     meLimit=me+ext->mappingsLength;
325 
326     result=0;
327 
328     for(;;) {
329         /* skip irrelevant mappings on both sides */
330         for(;;) {
331             if(mb==mbLimit) {
332                 return result;
333             }
334 
335             if((0<=mb->f && mb->f<=2) || mb->f==4) {
336                 break;
337             }
338 
339             ++mb;
340         }
341 
342         for(;;) {
343             if(me==meLimit) {
344                 return result;
345             }
346 
347             if((0<=me->f && me->f<=2) || me->f==4) {
348                 break;
349             }
350 
351             ++me;
352         }
353 
354         /* compare the base and extension mappings */
355         cmp=compareUnicode(base, mb, ext, me);
356         if(cmp<0) {
357             if(intersectBase && (intersectBase!=2 || mb->bLen>1)) {
358                 /*
359                  * mapping in base but not in ext, move it
360                  *
361                  * if ext is DBCS, move DBCS mappings here
362                  * and check SBCS ones for Unicode prefix below
363                  */
364                 mb->moveFlag|=UCM_MOVE_TO_EXT;
365                 result|=NEEDS_MOVE;
366 
367             /* does mb map from an input sequence that is a prefix of me's? */
368             } else if( mb->uLen<me->uLen &&
369                 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
370             ) {
371                 if(moveToExt) {
372                     /* mark this mapping to be moved to the extension table */
373                     mb->moveFlag|=UCM_MOVE_TO_EXT;
374                     result|=NEEDS_MOVE;
375                 } else {
376                     fprintf(stderr,
377                             "ucm error: the base table contains a mapping whose input sequence\n"
378                             "           is a prefix of the input sequence of an extension mapping\n");
379                     ucm_printMapping(base, mb, stderr);
380                     ucm_printMapping(ext, me, stderr);
381                     result|=HAS_ERRORS;
382                 }
383             }
384 
385             ++mb;
386         } else if(cmp==0) {
387             /*
388              * same output: remove the extension mapping,
389              * otherwise treat as an error
390              */
391             if( mb->f==me->f && mb->bLen==me->bLen &&
392                 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
393             ) {
394                 me->moveFlag|=UCM_REMOVE_MAPPING;
395                 result|=NEEDS_MOVE;
396             } else if(intersectBase) {
397                 /* mapping in base but not in ext, move it */
398                 mb->moveFlag|=UCM_MOVE_TO_EXT;
399                 result|=NEEDS_MOVE;
400             } else {
401                 fprintf(stderr,
402                         "ucm error: the base table contains a mapping whose input sequence\n"
403                         "           is the same as the input sequence of an extension mapping\n"
404                         "           but it maps differently\n");
405                 ucm_printMapping(base, mb, stderr);
406                 ucm_printMapping(ext, me, stderr);
407                 result|=HAS_ERRORS;
408             }
409 
410             ++mb;
411         } else /* cmp>0 */ {
412             ++me;
413         }
414     }
415 }
416 
417 static uint8_t
checkBaseExtBytes(UCMStates * baseStates,UCMTable * base,UCMTable * ext,UBool moveToExt,UBool intersectBase)418 checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
419                   UBool moveToExt, UBool intersectBase) {
420     UCMapping *mb, *me;
421     int32_t *baseMap, *extMap;
422     int32_t b, e, bLimit, eLimit, cmp;
423     uint8_t result;
424     UBool isSISO;
425 
426     baseMap=base->reverseMap;
427     extMap=ext->reverseMap;
428 
429     b=e=0;
430     bLimit=base->mappingsLength;
431     eLimit=ext->mappingsLength;
432 
433     result=0;
434 
435     isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO);
436 
437     for(;;) {
438         /* skip irrelevant mappings on both sides */
439         for(;; ++b) {
440             if(b==bLimit) {
441                 return result;
442             }
443             mb=base->mappings+baseMap[b];
444 
445             if(intersectBase==2 && mb->bLen==1) {
446                 /*
447                  * comparing a base against a DBCS extension:
448                  * leave SBCS base mappings alone
449                  */
450                 continue;
451             }
452 
453             if(mb->f==0 || mb->f==3) {
454                 break;
455             }
456         }
457 
458         for(;;) {
459             if(e==eLimit) {
460                 return result;
461             }
462             me=ext->mappings+extMap[e];
463 
464             if(me->f==0 || me->f==3) {
465                 break;
466             }
467 
468             ++e;
469         }
470 
471         /* compare the base and extension mappings */
472         cmp=compareBytes(base, mb, ext, me, TRUE);
473         if(cmp<0) {
474             if(intersectBase) {
475                 /* mapping in base but not in ext, move it */
476                 mb->moveFlag|=UCM_MOVE_TO_EXT;
477                 result|=NEEDS_MOVE;
478 
479             /*
480              * does mb map from an input sequence that is a prefix of me's?
481              * for SI/SO tables, a single byte is never a prefix because it
482              * occurs in a separate single-byte state
483              */
484             } else if( mb->bLen<me->bLen &&
485                 (!isSISO || mb->bLen>1) &&
486                 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
487             ) {
488                 if(moveToExt) {
489                     /* mark this mapping to be moved to the extension table */
490                     mb->moveFlag|=UCM_MOVE_TO_EXT;
491                     result|=NEEDS_MOVE;
492                 } else {
493                     fprintf(stderr,
494                             "ucm error: the base table contains a mapping whose input sequence\n"
495                             "           is a prefix of the input sequence of an extension mapping\n");
496                     ucm_printMapping(base, mb, stderr);
497                     ucm_printMapping(ext, me, stderr);
498                     result|=HAS_ERRORS;
499                 }
500             }
501 
502             ++b;
503         } else if(cmp==0) {
504             /*
505              * same output: remove the extension mapping,
506              * otherwise treat as an error
507              */
508             if( mb->f==me->f && mb->uLen==me->uLen &&
509                 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
510             ) {
511                 me->moveFlag|=UCM_REMOVE_MAPPING;
512                 result|=NEEDS_MOVE;
513             } else if(intersectBase) {
514                 /* mapping in base but not in ext, move it */
515                 mb->moveFlag|=UCM_MOVE_TO_EXT;
516                 result|=NEEDS_MOVE;
517             } else {
518                 fprintf(stderr,
519                         "ucm error: the base table contains a mapping whose input sequence\n"
520                         "           is the same as the input sequence of an extension mapping\n"
521                         "           but it maps differently\n");
522                 ucm_printMapping(base, mb, stderr);
523                 ucm_printMapping(ext, me, stderr);
524                 result|=HAS_ERRORS;
525             }
526 
527             ++b;
528         } else /* cmp>0 */ {
529             ++e;
530         }
531     }
532 }
533 
534 U_CAPI UBool U_EXPORT2
ucm_checkValidity(UCMTable * table,UCMStates * baseStates)535 ucm_checkValidity(UCMTable *table, UCMStates *baseStates) {
536     UCMapping *m, *mLimit;
537     int32_t count;
538     UBool isOK;
539 
540     m=table->mappings;
541     mLimit=m+table->mappingsLength;
542     isOK=TRUE;
543 
544     while(m<mLimit) {
545         count=ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen);
546         if(count<1) {
547             ucm_printMapping(table, m, stderr);
548             isOK=FALSE;
549         }
550         ++m;
551     }
552 
553     return isOK;
554 }
555 
556 U_CAPI UBool U_EXPORT2
ucm_checkBaseExt(UCMStates * baseStates,UCMTable * base,UCMTable * ext,UCMTable * moveTarget,UBool intersectBase)557 ucm_checkBaseExt(UCMStates *baseStates,
558                  UCMTable *base, UCMTable *ext, UCMTable *moveTarget,
559                  UBool intersectBase) {
560     uint8_t result;
561 
562     /* if we have an extension table, we must always use precision flags */
563     if(base->flagsType&UCM_FLAGS_IMPLICIT) {
564         fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n");
565         return FALSE;
566     }
567     if(ext->flagsType&UCM_FLAGS_IMPLICIT) {
568         fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n");
569         return FALSE;
570     }
571 
572     /* checking requires both tables to be sorted */
573     ucm_sortTable(base);
574     ucm_sortTable(ext);
575 
576     /* check */
577     result=
578         checkBaseExtUnicode(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase)|
579         checkBaseExtBytes(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase);
580 
581     if(result&HAS_ERRORS) {
582         return FALSE;
583     }
584 
585     if(result&NEEDS_MOVE) {
586         ucm_moveMappings(ext, NULL);
587         ucm_moveMappings(base, moveTarget);
588         ucm_sortTable(base);
589         ucm_sortTable(ext);
590         if(moveTarget!=NULL) {
591             ucm_sortTable(moveTarget);
592         }
593     }
594 
595     return TRUE;
596 }
597 
598 /* merge tables for rptp2ucm ------------------------------------------------ */
599 
600 U_CAPI void U_EXPORT2
ucm_mergeTables(UCMTable * fromUTable,UCMTable * toUTable,const uint8_t * subchar,int32_t subcharLength,uint8_t subchar1)601 ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
602                 const uint8_t *subchar, int32_t subcharLength,
603                 uint8_t subchar1) {
604     UCMapping *fromUMapping, *toUMapping;
605     int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp;
606 
607     ucm_sortTable(fromUTable);
608     ucm_sortTable(toUTable);
609 
610     fromUMapping=fromUTable->mappings;
611     toUMapping=toUTable->mappings;
612 
613     fromUTop=fromUTable->mappingsLength;
614     toUTop=toUTable->mappingsLength;
615 
616     fromUIndex=toUIndex=0;
617 
618     while(fromUIndex<fromUTop && toUIndex<toUTop) {
619         cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, TRUE);
620         if(cmp==0) {
621             /* equal: roundtrip, nothing to do (flags are initially 0) */
622             ++fromUMapping;
623             ++toUMapping;
624 
625             ++fromUIndex;
626             ++toUIndex;
627         } else if(cmp<0) {
628             /*
629              * the fromU mapping does not have a toU counterpart:
630              * fallback Unicode->codepage
631              */
632             if( (fromUMapping->bLen==subcharLength &&
633                  0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
634                 (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
635             ) {
636                 fromUMapping->f=2; /* SUB mapping */
637             } else {
638                 fromUMapping->f=1; /* normal fallback */
639             }
640 
641             ++fromUMapping;
642             ++fromUIndex;
643         } else {
644             /*
645              * the toU mapping does not have a fromU counterpart:
646              * (reverse) fallback codepage->Unicode, copy it to the fromU table
647              */
648 
649             /* ignore reverse fallbacks to Unicode SUB */
650             if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
651                 toUMapping->f=3; /* reverse fallback */
652                 ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
653 
654                 /* the table may have been reallocated */
655                 fromUMapping=fromUTable->mappings+fromUIndex;
656             }
657 
658             ++toUMapping;
659             ++toUIndex;
660         }
661     }
662 
663     /* either one or both tables are exhausted */
664     while(fromUIndex<fromUTop) {
665         /* leftover fromU mappings are fallbacks */
666         if( (fromUMapping->bLen==subcharLength &&
667              0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
668             (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
669         ) {
670             fromUMapping->f=2; /* SUB mapping */
671         } else {
672             fromUMapping->f=1; /* normal fallback */
673         }
674 
675         ++fromUMapping;
676         ++fromUIndex;
677     }
678 
679     while(toUIndex<toUTop) {
680         /* leftover toU mappings are reverse fallbacks */
681 
682         /* ignore reverse fallbacks to Unicode SUB */
683         if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
684             toUMapping->f=3; /* reverse fallback */
685             ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
686         }
687 
688         ++toUMapping;
689         ++toUIndex;
690     }
691 
692     fromUTable->isSorted=FALSE;
693 }
694 
695 /* separate extension mappings out of base table for rptp2ucm --------------- */
696 
697 U_CAPI UBool U_EXPORT2
ucm_separateMappings(UCMFile * ucm,UBool isSISO)698 ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
699     UCMTable *table;
700     UCMapping *m, *mLimit;
701     int32_t type;
702     UBool needsMove, isOK;
703 
704     table=ucm->base;
705     m=table->mappings;
706     mLimit=m+table->mappingsLength;
707 
708     needsMove=FALSE;
709     isOK=TRUE;
710 
711     for(; m<mLimit; ++m) {
712         if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) {
713             fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n");
714             ucm_printMapping(table, m, stderr);
715             m->moveFlag|=UCM_REMOVE_MAPPING;
716             needsMove=TRUE;
717             continue;
718         }
719 
720         type=ucm_mappingType(
721                 &ucm->states, m,
722                 UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m));
723         if(type<0) {
724             /* illegal byte sequence */
725             printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr);
726             isOK=FALSE;
727         } else if(type>0) {
728             m->moveFlag|=UCM_MOVE_TO_EXT;
729             needsMove=TRUE;
730         }
731     }
732 
733     if(!isOK) {
734         return FALSE;
735     }
736     if(needsMove) {
737         ucm_moveMappings(ucm->base, ucm->ext);
738         return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, FALSE);
739     } else {
740         ucm_sortTable(ucm->base);
741         return TRUE;
742     }
743 }
744 
745 /* ucm parser --------------------------------------------------------------- */
746 
747 U_CAPI int8_t U_EXPORT2
ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES],const char * line,const char ** ps)748 ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) {
749     const char *s=*ps;
750     char *end;
751     uint8_t byte;
752     int8_t bLen;
753 
754     bLen=0;
755     for(;;) {
756         /* skip an optional plus sign */
757         if(bLen>0 && *s=='+') {
758             ++s;
759         }
760         if(*s!='\\') {
761             break;
762         }
763 
764         if( s[1]!='x' ||
765             (byte=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4
766         ) {
767             fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line);
768             return -1;
769         }
770 
771         if(bLen==UCNV_EXT_MAX_BYTES) {
772             fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line);
773             return -1;
774         }
775         bytes[bLen++]=byte;
776         s=end;
777     }
778 
779     *ps=s;
780     return bLen;
781 }
782 
783 /* parse a mapping line; must not be empty */
784 U_CAPI UBool U_EXPORT2
ucm_parseMappingLine(UCMapping * m,UChar32 codePoints[UCNV_EXT_MAX_UCHARS],uint8_t bytes[UCNV_EXT_MAX_BYTES],const char * line)785 ucm_parseMappingLine(UCMapping *m,
786                      UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
787                      uint8_t bytes[UCNV_EXT_MAX_BYTES],
788                      const char *line) {
789     const char *s;
790     char *end;
791     UChar32 cp;
792     int32_t u16Length;
793     int8_t uLen, bLen, f;
794 
795     s=line;
796     uLen=bLen=0;
797 
798     /* parse code points */
799     for(;;) {
800         /* skip an optional plus sign */
801         if(uLen>0 && *s=='+') {
802             ++s;
803         }
804         if(*s!='<') {
805             break;
806         }
807 
808         if( s[1]!='U' ||
809             (cp=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 ||
810             *end!='>'
811         ) {
812             fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line);
813             return FALSE;
814         }
815         if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) {
816             fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line);
817             return FALSE;
818         }
819 
820         if(uLen==UCNV_EXT_MAX_UCHARS) {
821             fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line);
822             return FALSE;
823         }
824         codePoints[uLen++]=cp;
825         s=end+1;
826     }
827 
828     if(uLen==0) {
829         fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line);
830         return FALSE;
831     } else if(uLen==1) {
832         m->u=codePoints[0];
833     } else {
834         UErrorCode errorCode=U_ZERO_ERROR;
835         u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode);
836         if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) ||
837             u16Length>UCNV_EXT_MAX_UCHARS
838         ) {
839             fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line);
840             return FALSE;
841         }
842     }
843 
844     s=u_skipWhitespace(s);
845 
846     /* parse bytes */
847     bLen=ucm_parseBytes(bytes, line, &s);
848 
849     if(bLen<0) {
850         return FALSE;
851     } else if(bLen==0) {
852         fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line);
853         return FALSE;
854     } else if(bLen<=4) {
855         uprv_memcpy(m->b.bytes, bytes, bLen);
856     }
857 
858     /* skip everything until the fallback indicator, even the start of a comment */
859     for(;;) {
860         if(*s==0) {
861             f=-1; /* no fallback indicator */
862             break;
863         } else if(*s=='|') {
864             f=(int8_t)(s[1]-'0');
865             if((uint8_t)f>4) {
866                 fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line);
867                 return FALSE;
868             }
869             break;
870         }
871         ++s;
872     }
873 
874     m->uLen=uLen;
875     m->bLen=bLen;
876     m->f=f;
877     return TRUE;
878 }
879 
880 /* general APIs ------------------------------------------------------------- */
881 
882 U_CAPI UCMTable * U_EXPORT2
ucm_openTable()883 ucm_openTable() {
884     UCMTable *table=(UCMTable *)uprv_malloc(sizeof(UCMTable));
885     if(table==NULL) {
886         fprintf(stderr, "ucm error: unable to allocate a UCMTable\n");
887         exit(U_MEMORY_ALLOCATION_ERROR);
888     }
889 
890     memset(table, 0, sizeof(UCMTable));
891     return table;
892 }
893 
894 U_CAPI void U_EXPORT2
ucm_closeTable(UCMTable * table)895 ucm_closeTable(UCMTable *table) {
896     if(table!=NULL) {
897         uprv_free(table->mappings);
898         uprv_free(table->codePoints);
899         uprv_free(table->bytes);
900         uprv_free(table->reverseMap);
901         uprv_free(table);
902     }
903 }
904 
905 U_CAPI void U_EXPORT2
ucm_resetTable(UCMTable * table)906 ucm_resetTable(UCMTable *table) {
907     if(table!=NULL) {
908         table->mappingsLength=0;
909         table->flagsType=0;
910         table->unicodeMask=0;
911         table->bytesLength=table->codePointsLength=0;
912         table->isSorted=FALSE;
913     }
914 }
915 
916 U_CAPI void U_EXPORT2
ucm_addMapping(UCMTable * table,UCMapping * m,UChar32 codePoints[UCNV_EXT_MAX_UCHARS],uint8_t bytes[UCNV_EXT_MAX_BYTES])917 ucm_addMapping(UCMTable *table,
918                UCMapping *m,
919                UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
920                uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
921     UCMapping *tm;
922     UChar32 c;
923     int32_t idx;
924 
925     if(table->mappingsLength>=table->mappingsCapacity) {
926         /* make the mappings array larger */
927         if(table->mappingsCapacity==0) {
928             table->mappingsCapacity=1000;
929         } else {
930             table->mappingsCapacity*=10;
931         }
932         table->mappings=(UCMapping *)uprv_realloc(table->mappings,
933                                              table->mappingsCapacity*sizeof(UCMapping));
934         if(table->mappings==NULL) {
935             fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n",
936                             (int)table->mappingsCapacity);
937             exit(U_MEMORY_ALLOCATION_ERROR);
938         }
939 
940         if(table->reverseMap!=NULL) {
941             /* the reverseMap must be reallocated in a new sort */
942             uprv_free(table->reverseMap);
943             table->reverseMap=NULL;
944         }
945     }
946 
947     if(m->uLen>1 && table->codePointsCapacity==0) {
948         table->codePointsCapacity=10000;
949         table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4);
950         if(table->codePoints==NULL) {
951             fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n",
952                             (int)table->codePointsCapacity);
953             exit(U_MEMORY_ALLOCATION_ERROR);
954         }
955     }
956 
957     if(m->bLen>4 && table->bytesCapacity==0) {
958         table->bytesCapacity=10000;
959         table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity);
960         if(table->bytes==NULL) {
961             fprintf(stderr, "ucm error: unable to allocate %d bytes\n",
962                             (int)table->bytesCapacity);
963             exit(U_MEMORY_ALLOCATION_ERROR);
964         }
965     }
966 
967     if(m->uLen>1) {
968         idx=table->codePointsLength;
969         table->codePointsLength+=m->uLen;
970         if(table->codePointsLength>table->codePointsCapacity) {
971             fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n");
972             exit(U_MEMORY_ALLOCATION_ERROR);
973         }
974 
975         uprv_memcpy(table->codePoints+idx, codePoints, (size_t)m->uLen*4);
976         m->u=idx;
977     }
978 
979     if(m->bLen>4) {
980         idx=table->bytesLength;
981         table->bytesLength+=m->bLen;
982         if(table->bytesLength>table->bytesCapacity) {
983             fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n");
984             exit(U_MEMORY_ALLOCATION_ERROR);
985         }
986 
987         uprv_memcpy(table->bytes+idx, bytes, m->bLen);
988         m->b.idx=idx;
989     }
990 
991     /* set unicodeMask */
992     for(idx=0; idx<m->uLen; ++idx) {
993         c=codePoints[idx];
994         if(c>=0x10000) {
995             table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */
996         } else if(U_IS_SURROGATE(c)) {
997             table->unicodeMask|=UCNV_HAS_SURROGATES;    /* there are surrogate code points */
998         }
999     }
1000 
1001     /* set flagsType */
1002     if(m->f<0) {
1003         table->flagsType|=UCM_FLAGS_IMPLICIT;
1004     } else {
1005         table->flagsType|=UCM_FLAGS_EXPLICIT;
1006     }
1007 
1008     tm=table->mappings+table->mappingsLength++;
1009     uprv_memcpy(tm, m, sizeof(UCMapping));
1010 
1011     table->isSorted=FALSE;
1012 }
1013 
/*
 * Create a new UCMFile with empty base and extension tables.
 *
 * The struct is zero-filled, both tables are created with ucm_openTable(),
 * and the state table fields are given their starting values.
 * If the allocation fails, an error is printed to stderr and the process
 * exits with U_MEMORY_ALLOCATION_ERROR; this function never returns NULL.
 */
U_CAPI UCMFile * U_EXPORT2
ucm_open() {
    UCMStates *states;
    UCMFile *file=(UCMFile *)uprv_malloc(sizeof(*file));

    if(file==NULL) {
        fprintf(stderr, "ucm error: unable to allocate a UCMFile\n");
        exit(U_MEMORY_ALLOCATION_ERROR);
    }
    memset(file, 0, sizeof(*file));

    file->base=ucm_openTable();
    file->ext=ucm_openTable();

    /* starting values for the state table information */
    states=&file->states;
    states->stateFlags[0]=MBCS_STATE_FLAG_DIRECT;
    states->conversionType=UCNV_UNSUPPORTED_CONVERTER;
    states->outputType=-1;
    states->maxCharLength=1;
    states->minCharLength=1;

    return file;
}
1034 
/*
 * Free a UCMFile together with its base and extension tables.
 * A NULL pointer is accepted and ignored.
 */
U_CAPI void U_EXPORT2
ucm_close(UCMFile *ucm) {
    if(ucm==NULL) {
        return;
    }

    ucm_closeTable(ucm->base);
    ucm_closeTable(ucm->ext);

    uprv_free(ucm);
}
1043 
/*
 * Decide whether a mapping can go into a base table or needs an extension table.
 *
 * baseStates - state table used to validate the bytes and count characters
 * m          - the mapping (uLen, bLen and f are examined)
 * codePoints - unused
 * bytes      - the m->bLen bytes of the mapping
 *
 * Returns -1 if the byte sequence is illegal (ucm_countChars() returns <1),
 *          0 if the mapping is suitable for a base table,
 *          1 if it needs to go into an extension table.
 *
 * Suitable for an ICU conversion base table means:
 * - a 1:1 mapping (1 Unicode code point : 1 byte sequence)
 * - precision flag 0..3
 * - SBCS: any 1:1 mapping
 *         (the table stores additional bits to distinguish mapping types)
 * - MBCS: not a |2 SUB mapping for <subchar1>
 * - MBCS: not a |1 fallback to 0x00
 * - MBCS: not a multi-byte mapping with leading 0x00 bytes
 *
 * Further restrictions for fromUnicode tables
 * are enforced in makeconv (MBCSOkForBaseFromUnicode()).
 *
 * All of the MBCS fromUnicode specific tests could be removed from here,
 * but the ones above are for unusual mappings, and removing the tests
 * from here would change canonucm output which seems gratuitous.
 * (Markus Scherer 2006-nov-28)
 *
 * Exception: All implicit mappings (f<0) that need to be moved
 * because of fromUnicode restrictions _must_ be moved here because
 * makeconv uses a hack for moving mappings only for the fromUnicode table
 * that only works with non-negative values of f.
 */
U_CAPI int32_t U_EXPORT2
ucm_mappingType(UCMStates *baseStates,
                UCMapping *m,
                UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
                uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
    int32_t charCount;

    (void)codePoints;

    /* check validity of the bytes and count the characters in them */
    charCount=ucm_countChars(baseStates, bytes, m->bLen);
    if(charCount<1) {
        return -1; /* illegal byte sequence */
    }

    /* anything but a 1:1 mapping with precision flag <=3 needs an extension table */
    if(m->uLen!=1 || charCount!=1 || m->f>3) {
        return 1;
    }

    /* SBCS: any 1:1 mapping is suitable for a base table */
    if(baseStates->maxCharLength==1) {
        return 0;
    }

    /* MBCS: a single-byte |2 mapping (SUB mapping for <subchar1>) */
    if(m->f==2 && m->bLen==1) {
        return 1;
    }

    /* MBCS: a |1 fallback whose first byte is 0x00 */
    if(m->f==1 && bytes[0]==0) {
        return 1;
    }

    /* MBCS: a multi-byte mapping (f<=1) with a leading 0x00 byte */
    if(m->f<=1 && m->bLen>1 && bytes[0]==0) {
        return 1;
    }

    return 0; /* suitable for a base table */
}
1091 
/*
 * Validate a mapping and add it to the base or the extension table of ucm.
 *
 * ucm        - file object whose base/ext table receives the mapping
 * forBase    - TRUE if the mapping should go into the base table when suitable
 * baseStates - state table for validating the bytes; if NULL, the mapping
 *              is not validated and always goes into the extension table
 * m, codePoints, bytes - the mapping and its code points and bytes
 *
 * Returns FALSE, after printing the mapping to stderr, for a |2 mapping
 * from more than one code point or for an illegal byte sequence;
 * otherwise adds the mapping and returns TRUE.
 */
U_CAPI UBool U_EXPORT2
ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates,
                   UCMapping *m,
                   UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
                   uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
    UCMTable *target;
    int32_t type;

    if(m->f==2 && m->uLen>1) {
        fprintf(stderr, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n");
        printMapping(m, codePoints, bytes, stderr);
        return FALSE;
    }

    if(baseStates==NULL) {
        /* not used - adding a mapping for an extension-only table before its base table is read */
        type=1;
    } else {
        /* check validity of the bytes and find out which table the mapping fits */
        type=ucm_mappingType(baseStates, m, codePoints, bytes);
        if(type<0) {
            /* illegal byte sequence */
            printMapping(m, codePoints, bytes, stderr);
            return FALSE;
        }
    }

    /*
     * The base table gets the mapping only if this is requested and suitable;
     * in every other case it goes into the extension table.
     */
    if(forBase && type==0) {
        target=ucm->base;
    } else {
        target=ucm->ext;
    }
    ucm_addMapping(target, m, codePoints, bytes);

    return TRUE;
}
1130 
/*
 * Parse one line of a mapping table and add the resulting mapping to ucm.
 *
 * Lines that start with '#' in the first column, and lines that contain
 * nothing but whitespace before the end of the string or a CR/LF,
 * are skipped and count as success.
 *
 * Returns TRUE if the line was skipped or its mapping was parsed and added,
 * FALSE if parsing or adding failed.
 */
U_CAPI UBool U_EXPORT2
ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) {
    UCMapping m={ 0, {0}, 0, 0, 0, 0 };
    UChar32 codePoints[UCNV_EXT_MAX_UCHARS];
    uint8_t bytes[UCNV_EXT_MAX_BYTES];
    const char *s;

    /* ignore comment lines */
    if(line[0]=='#') {
        return TRUE;
    }

    /* ignore empty lines */
    s=u_skipWhitespace(line);
    if(*s==0 || *s=='\n' || *s=='\r') {
        return TRUE;
    }

    if(!ucm_parseMappingLine(&m, codePoints, bytes, line)) {
        return FALSE;
    }
    return ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes);
}
1148 
/*
 * Read mapping lines from convFile up to the "END CHARMAP" line
 * and add each mapping to ucm.
 *
 * ucm        - file object that receives the mappings
 * convFile   - input stream, read line by line
 * forBase, baseStates - passed through to ucm_addMappingFromLine()
 * pErrorCode - in/out error code; nothing is done if it already indicates
 *              a failure. Set to U_INVALID_TABLE_FORMAT if any line could
 *              not be added or if the input ends before "END CHARMAP".
 *
 * Reading continues after a bad line, so every line up to the end of the
 * table is still passed to ucm_addMappingFromLine().
 */
U_CAPI void U_EXPORT2
ucm_readTable(UCMFile *ucm, FileStream* convFile,
              UBool forBase, UCMStates *baseStates,
              UErrorCode *pErrorCode) {
    char line[500];
    UBool isOK=TRUE;

    if(U_FAILURE(*pErrorCode)) {
        return;
    }

    for(;;) {
        char *tail;

        /* read the next line; running out of input here means the table is truncated */
        if(!T_FileStream_readLine(convFile, line, sizeof(line))) {
            fprintf(stderr, "incomplete charmap section\n");
            isOK=FALSE;
            break;
        }

        /* remove CR LF: back up from the terminating NUL over any line-end characters */
        tail=uprv_strchr(line, 0);
        while(tail>line && (tail[-1]=='\r' || tail[-1]=='\n')) {
            --tail;
        }
        *tail=0;

        /* ignore empty and comment lines */
        if(line[0]==0 || line[0]=='#') {
            continue;
        }

        /* stop at the end of the mapping table */
        if(uprv_strcmp(line, "END CHARMAP")==0) {
            break;
        }

        /* remember a failure but keep going with the following lines */
        if(!ucm_addMappingFromLine(ucm, line, forBase, baseStates)) {
            isOK=FALSE;
        }
    }

    if(!isOK) {
        *pErrorCode=U_INVALID_TABLE_FORMAT;
    }
}
1195 #endif
1196