1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2003-2014, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  gencnvex.c
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2003oct12
16 *   created by: Markus W. Scherer
17 */
18 
19 #include <stdio.h>
20 #include "unicode/utypes.h"
21 #include "unicode/ustring.h"
22 #include "cstring.h"
23 #include "cmemory.h"
24 #include "ucnv_cnv.h"
25 #include "ucnvmbcs.h"
26 #include "toolutil.h"
27 #include "unewdata.h"
28 #include "ucm.h"
29 #include "makeconv.h"
30 #include "genmbcs.h"
31 
32 static void
33 CnvExtClose(NewConverter *cnvData);
34 
35 static UBool
36 CnvExtIsValid(NewConverter *cnvData,
37               const uint8_t *bytes, int32_t length);
38 
39 static UBool
40 CnvExtAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData);
41 
42 static uint32_t
43 CnvExtWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
44             UNewDataMemory *pData, int32_t tableType);
45 
46 typedef struct CnvExtData {
47     NewConverter newConverter;
48 
49     UCMFile *ucm;
50 
51     /* toUnicode (state table in ucm->states) */
52     UToolMemory *toUTable, *toUUChars;
53 
54     /* fromUnicode */
55     UToolMemory *fromUTableUChars, *fromUTableValues, *fromUBytes;
56 
57     uint16_t stage1[MBCS_STAGE_1_SIZE];
58     uint16_t stage2[MBCS_STAGE_2_SIZE];
59     uint16_t stage3[0x10000<<UCNV_EXT_STAGE_2_LEFT_SHIFT]; /* 0x10000 because of 16-bit stage 2/3 indexes */
60     uint32_t stage3b[0x10000];
61 
62     int32_t stage1Top, stage2Top, stage3Top, stage3bTop;
63 
64     /* for stage3 compaction of <subchar1> |2 mappings */
65     uint16_t stage3Sub1Block;
66 
67     /* statistics */
68     int32_t
69         maxInBytes, maxOutBytes, maxBytesPerUChar,
70         maxInUChars, maxOutUChars, maxUCharsPerByte;
71 } CnvExtData;
72 
73 NewConverter *
CnvExtOpen(UCMFile * ucm)74 CnvExtOpen(UCMFile *ucm) {
75     CnvExtData *extData;
76 
77     extData=(CnvExtData *)uprv_malloc(sizeof(CnvExtData));
78     if(extData==NULL) {
79         printf("out of memory\n");
80         exit(U_MEMORY_ALLOCATION_ERROR);
81     }
82     uprv_memset(extData, 0, sizeof(CnvExtData));
83 
84     extData->ucm=ucm; /* aliased, not owned */
85 
86     extData->newConverter.close=CnvExtClose;
87     extData->newConverter.isValid=CnvExtIsValid;
88     extData->newConverter.addTable=CnvExtAddTable;
89     extData->newConverter.write=CnvExtWrite;
90     return &extData->newConverter;
91 }
92 
93 static void
CnvExtClose(NewConverter * cnvData)94 CnvExtClose(NewConverter *cnvData) {
95     CnvExtData *extData=(CnvExtData *)cnvData;
96     if(extData!=NULL) {
97         utm_close(extData->toUTable);
98         utm_close(extData->toUUChars);
99         utm_close(extData->fromUTableUChars);
100         utm_close(extData->fromUTableValues);
101         utm_close(extData->fromUBytes);
102         uprv_free(extData);
103     }
104 }
105 
106 /* we do not expect this to be called */
107 static UBool
CnvExtIsValid(NewConverter * cnvData,const uint8_t * bytes,int32_t length)108 CnvExtIsValid(NewConverter *cnvData,
109         const uint8_t *bytes, int32_t length) {
110     // suppress compiler warnings about unused variables
111     (void)cnvData;
112     (void)bytes;
113     (void)length;
114     return FALSE;
115 }
116 
117 static uint32_t
CnvExtWrite(NewConverter * cnvData,const UConverterStaticData * staticData,UNewDataMemory * pData,int32_t tableType)118 CnvExtWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
119             UNewDataMemory *pData, int32_t tableType) {
120     (void) staticData; // suppress compiler warnings about unused variable
121     CnvExtData *extData=(CnvExtData *)cnvData;
122     int32_t length, top, headerSize;
123 
124     int32_t indexes[UCNV_EXT_INDEXES_MIN_LENGTH]={ 0 };
125 
126     if(tableType&TABLE_BASE) {
127         headerSize=0;
128     } else {
129         _MBCSHeader header={ { 0, 0, 0, 0 }, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
130 
131         /* write the header and base table name for an extension-only table */
132         length=(int32_t)uprv_strlen(extData->ucm->baseName)+1;
133         while(length&3) {
134             /* add padding */
135             extData->ucm->baseName[length++]=0;
136         }
137 
138         headerSize=MBCS_HEADER_V4_LENGTH*4+length;
139 
140         /* fill the header */
141         header.version[0]=4;
142         header.version[1]=2;
143         header.flags=(uint32_t)((headerSize<<8)|MBCS_OUTPUT_EXT_ONLY);
144 
145         /* write the header and the base table name */
146         udata_writeBlock(pData, &header, MBCS_HEADER_V4_LENGTH*4);
147         udata_writeBlock(pData, extData->ucm->baseName, length);
148     }
149 
150     /* fill indexes[] - offsets/indexes are in units of the target array */
151     top=0;
152 
153     indexes[UCNV_EXT_INDEXES_LENGTH]=length=UCNV_EXT_INDEXES_MIN_LENGTH;
154     top+=length*4;
155 
156     indexes[UCNV_EXT_TO_U_INDEX]=top;
157     indexes[UCNV_EXT_TO_U_LENGTH]=length=utm_countItems(extData->toUTable);
158     top+=length*4;
159 
160     indexes[UCNV_EXT_TO_U_UCHARS_INDEX]=top;
161     indexes[UCNV_EXT_TO_U_UCHARS_LENGTH]=length=utm_countItems(extData->toUUChars);
162     top+=length*2;
163 
164     indexes[UCNV_EXT_FROM_U_UCHARS_INDEX]=top;
165     length=utm_countItems(extData->fromUTableUChars);
166     top+=length*2;
167 
168     if(top&3) {
169         /* add padding */
170         *((UChar *)utm_alloc(extData->fromUTableUChars))=0;
171         *((uint32_t *)utm_alloc(extData->fromUTableValues))=0;
172         ++length;
173         top+=2;
174     }
175     indexes[UCNV_EXT_FROM_U_LENGTH]=length;
176 
177     indexes[UCNV_EXT_FROM_U_VALUES_INDEX]=top;
178     top+=length*4;
179 
180     indexes[UCNV_EXT_FROM_U_BYTES_INDEX]=top;
181     length=utm_countItems(extData->fromUBytes);
182     top+=length;
183 
184     if(top&1) {
185         /* add padding */
186         *((uint8_t *)utm_alloc(extData->fromUBytes))=0;
187         ++length;
188         ++top;
189     }
190     indexes[UCNV_EXT_FROM_U_BYTES_LENGTH]=length;
191 
192     indexes[UCNV_EXT_FROM_U_STAGE_12_INDEX]=top;
193     indexes[UCNV_EXT_FROM_U_STAGE_1_LENGTH]=length=extData->stage1Top;
194     indexes[UCNV_EXT_FROM_U_STAGE_12_LENGTH]=length+=extData->stage2Top;
195     top+=length*2;
196 
197     indexes[UCNV_EXT_FROM_U_STAGE_3_INDEX]=top;
198     length=extData->stage3Top;
199     top+=length*2;
200 
201     if(top&3) {
202         /* add padding */
203         extData->stage3[extData->stage3Top++]=0;
204         ++length;
205         top+=2;
206     }
207     indexes[UCNV_EXT_FROM_U_STAGE_3_LENGTH]=length;
208 
209     indexes[UCNV_EXT_FROM_U_STAGE_3B_INDEX]=top;
210     indexes[UCNV_EXT_FROM_U_STAGE_3B_LENGTH]=length=extData->stage3bTop;
211     top+=length*4;
212 
213     indexes[UCNV_EXT_SIZE]=top;
214 
215     /* statistics */
216     indexes[UCNV_EXT_COUNT_BYTES]=
217         (extData->maxInBytes<<16)|
218         (extData->maxOutBytes<<8)|
219         extData->maxBytesPerUChar;
220     indexes[UCNV_EXT_COUNT_UCHARS]=
221         (extData->maxInUChars<<16)|
222         (extData->maxOutUChars<<8)|
223         extData->maxUCharsPerByte;
224 
225     indexes[UCNV_EXT_FLAGS]=extData->ucm->ext->unicodeMask;
226 
227     /* write the extension data */
228     udata_writeBlock(pData, indexes, sizeof(indexes));
229     udata_writeBlock(pData, utm_getStart(extData->toUTable), indexes[UCNV_EXT_TO_U_LENGTH]*4);
230     udata_writeBlock(pData, utm_getStart(extData->toUUChars), indexes[UCNV_EXT_TO_U_UCHARS_LENGTH]*2);
231 
232     udata_writeBlock(pData, utm_getStart(extData->fromUTableUChars), indexes[UCNV_EXT_FROM_U_LENGTH]*2);
233     udata_writeBlock(pData, utm_getStart(extData->fromUTableValues), indexes[UCNV_EXT_FROM_U_LENGTH]*4);
234     udata_writeBlock(pData, utm_getStart(extData->fromUBytes), indexes[UCNV_EXT_FROM_U_BYTES_LENGTH]);
235 
236     udata_writeBlock(pData, extData->stage1, extData->stage1Top*2);
237     udata_writeBlock(pData, extData->stage2, extData->stage2Top*2);
238     udata_writeBlock(pData, extData->stage3, extData->stage3Top*2);
239     udata_writeBlock(pData, extData->stage3b, extData->stage3bTop*4);
240 
241 #if 0
242     {
243         int32_t i, j;
244 
245         length=extData->stage1Top;
246         printf("\nstage1[%x]:\n", length);
247 
248         for(i=0; i<length; ++i) {
249             if(extData->stage1[i]!=length) {
250                 printf("stage1[%04x]=%04x\n", i, extData->stage1[i]);
251             }
252         }
253 
254         j=length;
255         length=extData->stage2Top;
256         printf("\nstage2[%x]:\n", length);
257 
258         for(i=0; i<length; ++j, ++i) {
259             if(extData->stage2[i]!=0) {
260                 printf("stage12[%04x]=%04x\n", j, extData->stage2[i]);
261             }
262         }
263 
264         length=extData->stage3Top;
265         printf("\nstage3[%x]:\n", length);
266 
267         for(i=0; i<length; ++i) {
268             if(extData->stage3[i]!=0) {
269                 printf("stage3[%04x]=%04x\n", i, extData->stage3[i]);
270             }
271         }
272 
273         length=extData->stage3bTop;
274         printf("\nstage3b[%x]:\n", length);
275 
276         for(i=0; i<length; ++i) {
277             if(extData->stage3b[i]!=0) {
278                 printf("stage3b[%04x]=%08x\n", i, extData->stage3b[i]);
279             }
280         }
281     }
282 #endif
283 
284     if(VERBOSE) {
285         printf("size of extension data: %ld\n", (long)top);
286     }
287 
288     /* return the number of bytes that should have been written */
289     return (uint32_t)(headerSize+top);
290 }
291 
292 /* to Unicode --------------------------------------------------------------- */
293 
294 /*
295  * Remove fromUnicode fallbacks and SUB mappings which are irrelevant for
296  * the toUnicode table.
297  * This includes mappings with MBCS_FROM_U_EXT_FLAG which were suitable
298  * for the base toUnicode table but not for the base fromUnicode table.
299  * The table must be sorted.
300  * Modifies previous data in the reverseMap.
301  */
302 static int32_t
reduceToUMappings(UCMTable * table)303 reduceToUMappings(UCMTable *table) {
304     UCMapping *mappings;
305     int32_t *map;
306     int32_t i, j, count;
307     int8_t flag;
308 
309     mappings=table->mappings;
310     map=table->reverseMap;
311     count=table->mappingsLength;
312 
313     /* leave the map alone for the initial mappings with desired flags */
314     for(i=j=0; i<count; ++i) {
315         flag=mappings[map[i]].f;
316         if(flag!=0 && flag!=3) {
317             break;
318         }
319     }
320 
321     /* reduce from here to the rest */
322     for(j=i; i<count; ++i) {
323         flag=mappings[map[i]].f;
324         if(flag==0 || flag==3) {
325             map[j++]=map[i];
326         }
327     }
328 
329     return j;
330 }
331 
332 static uint32_t
getToUnicodeValue(CnvExtData * extData,UCMTable * table,UCMapping * m)333 getToUnicodeValue(CnvExtData *extData, UCMTable *table, UCMapping *m) {
334     UChar32 *u32;
335     UChar *u;
336     uint32_t value;
337     int32_t u16Length, ratio;
338     UErrorCode errorCode;
339 
340     /* write the Unicode result code point or string index */
341     if(m->uLen==1) {
342         u16Length=U16_LENGTH(m->u);
343         value=(uint32_t)(UCNV_EXT_TO_U_MIN_CODE_POINT+m->u);
344     } else {
345         /* the parser enforces m->uLen<=UCNV_EXT_MAX_UCHARS */
346 
347         /* get the result code point string and its 16-bit string length */
348         u32=UCM_GET_CODE_POINTS(table, m);
349         errorCode=U_ZERO_ERROR;
350         u_strFromUTF32(NULL, 0, &u16Length, u32, m->uLen, &errorCode);
351         if(U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) {
352             exit(errorCode);
353         }
354 
355         /* allocate it and put its length and index into the value */
356         value=
357             (((uint32_t)u16Length+UCNV_EXT_TO_U_LENGTH_OFFSET)<<UCNV_EXT_TO_U_LENGTH_SHIFT)|
358             ((uint32_t)utm_countItems(extData->toUUChars));
359         u=utm_allocN(extData->toUUChars, u16Length);
360 
361         /* write the result 16-bit string */
362         errorCode=U_ZERO_ERROR;
363         u_strFromUTF32(u, u16Length, NULL, u32, m->uLen, &errorCode);
364         if(U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) {
365             exit(errorCode);
366         }
367     }
368     if(m->f==0) {
369         value|=UCNV_EXT_TO_U_ROUNDTRIP_FLAG;
370     }
371 
372     /* update statistics */
373     if(m->bLen>extData->maxInBytes) {
374         extData->maxInBytes=m->bLen;
375     }
376     if(u16Length>extData->maxOutUChars) {
377         extData->maxOutUChars=u16Length;
378     }
379 
380     ratio=(u16Length+(m->bLen-1))/m->bLen;
381     if(ratio>extData->maxUCharsPerByte) {
382         extData->maxUCharsPerByte=ratio;
383     }
384 
385     return value;
386 }
387 
388 /*
389  * Recursive toUTable generator core function.
390  * Preconditions:
391  * - start<limit (There is at least one mapping.)
392  * - The mappings are sorted lexically. (Access is through the reverseMap.)
393  * - All mappings between start and limit have input sequences that share
394  *   the same prefix of unitIndex length, and therefore all of these sequences
395  *   are at least unitIndex+1 long.
396  * - There are only relevant mappings available through the reverseMap,
397  *   see reduceToUMappings().
398  *
399  * One function invocation generates one section table.
400  *
401  * Steps:
402  * 1. Count the number of unique unit values and get the low/high unit values
403  *    that occur at unitIndex.
404  * 2. Allocate the section table with possible optimization for linear access.
405  * 3. Write temporary version of the section table with start indexes of
406  *    subsections, each corresponding to one unit value at unitIndex.
407  * 4. Iterate through the table once more, and depending on the subsection length:
408  *    0: write 0 as a result value (unused byte in linear-access section table)
409  *   >0: if there is one mapping with an input unit sequence of unitIndex+1
410  *       then defaultValue=compute the mapping result for this whole sequence
411  *       else defaultValue=0
412  *
413  *       recurse into the subsection
414  */
415 static UBool
generateToUTable(CnvExtData * extData,UCMTable * table,int32_t start,int32_t limit,int32_t unitIndex,uint32_t defaultValue)416 generateToUTable(CnvExtData *extData, UCMTable *table,
417                  int32_t start, int32_t limit, int32_t unitIndex,
418                  uint32_t defaultValue) {
419     UCMapping *mappings, *m;
420     int32_t *map;
421     int32_t i, j, uniqueCount, count, subStart, subLimit;
422 
423     uint8_t *bytes;
424     int32_t low, high, prev;
425 
426     uint32_t *section;
427 
428     mappings=table->mappings;
429     map=table->reverseMap;
430 
431     /* step 1: examine the input units; set low, high, uniqueCount */
432     m=mappings+map[start];
433     bytes=UCM_GET_BYTES(table, m);
434     low=bytes[unitIndex];
435     uniqueCount=1;
436 
437     prev=high=low;
438     for(i=start+1; i<limit; ++i) {
439         m=mappings+map[i];
440         bytes=UCM_GET_BYTES(table, m);
441         high=bytes[unitIndex];
442 
443         if(high!=prev) {
444             prev=high;
445             ++uniqueCount;
446         }
447     }
448 
449     /* step 2: allocate the section; set count, section */
450     count=(high-low)+1;
451     if(count<0x100 && (unitIndex==0 || uniqueCount>=(3*count)/4)) {
452         /*
453          * for the root table and for fairly full tables:
454          * allocate for direct, linear array access
455          * by keeping count, to write an entry for each unit value
456          * from low to high
457          * exception: use a compact table if count==0x100 because
458          * that cannot be encoded in the length byte
459          */
460     } else {
461         count=uniqueCount;
462     }
463 
464     if(count>=0x100) {
465         fprintf(stderr, "error: toUnicode extension table section overflow: %ld section entries\n", (long)count);
466         return FALSE;
467     }
468 
469     /* allocate the section: 1 entry for the header + count for the items */
470     section=(uint32_t *)utm_allocN(extData->toUTable, 1+count);
471 
472     /* write the section header */
473     *section++=((uint32_t)count<<UCNV_EXT_TO_U_BYTE_SHIFT)|defaultValue;
474 
475     /* step 3: write temporary section table with subsection starts */
476     prev=low-1; /* just before low to prevent empty subsections before low */
477     j=0; /* section table index */
478     for(i=start; i<limit; ++i) {
479         m=mappings+map[i];
480         bytes=UCM_GET_BYTES(table, m);
481         high=bytes[unitIndex];
482 
483         if(high!=prev) {
484             /* start of a new subsection for unit high */
485             if(count>uniqueCount) {
486                 /* write empty subsections for unused units in a linear table */
487                 while(++prev<high) {
488                     section[j++]=((uint32_t)prev<<UCNV_EXT_TO_U_BYTE_SHIFT)|(uint32_t)i;
489                 }
490             } else {
491                 prev=high;
492             }
493 
494             /* write the entry with the subsection start */
495             section[j++]=((uint32_t)high<<UCNV_EXT_TO_U_BYTE_SHIFT)|(uint32_t)i;
496         }
497     }
498     /* assert(j==count) */
499 
500     /* step 4: recurse and write results */
501     subLimit=UCNV_EXT_TO_U_GET_VALUE(section[0]);
502     for(j=0; j<count; ++j) {
503         subStart=subLimit;
504         subLimit= (j+1)<count ? UCNV_EXT_TO_U_GET_VALUE(section[j+1]) : limit;
505 
506         /* remove the subStart temporary value */
507         section[j]&=~UCNV_EXT_TO_U_VALUE_MASK;
508 
509         if(subStart==subLimit) {
510             /* leave the value zero: empty subsection for unused unit in a linear table */
511             continue;
512         }
513 
514         /* see if there is exactly one input unit sequence of length unitIndex+1 */
515         defaultValue=0;
516         m=mappings+map[subStart];
517         if(m->bLen==unitIndex+1) {
518             /* do not include this in generateToUTable() */
519             ++subStart;
520 
521             if(subStart<subLimit && mappings[map[subStart]].bLen==unitIndex+1) {
522                 /* print error for multiple same-input-sequence mappings */
523                 fprintf(stderr, "error: multiple mappings from same bytes\n");
524                 ucm_printMapping(table, m, stderr);
525                 ucm_printMapping(table, mappings+map[subStart], stderr);
526                 return FALSE;
527             }
528 
529             defaultValue=getToUnicodeValue(extData, table, m);
530         }
531 
532         if(subStart==subLimit) {
533             /* write the result for the input sequence ending here */
534             section[j]|=defaultValue;
535         } else {
536             /* write the index to the subsection table */
537             section[j]|=(uint32_t)utm_countItems(extData->toUTable);
538 
539             /* recurse */
540             if(!generateToUTable(extData, table, subStart, subLimit, unitIndex+1, defaultValue)) {
541                 return FALSE;
542             }
543         }
544     }
545     return TRUE;
546 }
547 
548 /*
549  * Generate the toUTable and toUUChars from the input table.
550  * The input table must be sorted, and all precision flags must be 0..3.
551  * This function will modify the table's reverseMap.
552  */
553 static UBool
makeToUTable(CnvExtData * extData,UCMTable * table)554 makeToUTable(CnvExtData *extData, UCMTable *table) {
555     int32_t toUCount;
556 
557     toUCount=reduceToUMappings(table);
558 
559     extData->toUTable=utm_open("cnv extension toUTable", 0x10000, UCNV_EXT_TO_U_MIN_CODE_POINT, 4);
560     extData->toUUChars=utm_open("cnv extension toUUChars", 0x10000, UCNV_EXT_TO_U_INDEX_MASK+1, 2);
561 
562     return generateToUTable(extData, table, 0, toUCount, 0, 0);
563 }
564 
565 /* from Unicode ------------------------------------------------------------- */
566 
567 /*
568  * preprocessing:
569  * rebuild reverseMap with mapping indexes for mappings relevant for from Unicode
570  * change each Unicode string to encode all but the first code point in 16-bit form
571  *
572  * generation:
573  * for each unique code point
574  *   write an entry in the 3-stage trie
575  *   check that there is only one single-code point sequence
576  *   start recursion for following 16-bit input units
577  */
578 
579 /*
580  * Remove toUnicode fallbacks and non-<subchar1> SUB mappings
581  * which are irrelevant for the fromUnicode extension table.
582  * Remove MBCS_FROM_U_EXT_FLAG bits.
583  * Overwrite the reverseMap with an index array to the relevant mappings.
584  * Modify the code point sequences to a generator-friendly format where
585  * the first code points remains unchanged but the following are recoded
586  * into 16-bit Unicode string form.
587  * The table must be sorted.
588  * Destroys previous data in the reverseMap.
589  */
590 static int32_t
prepareFromUMappings(UCMTable * table)591 prepareFromUMappings(UCMTable *table) {
592     UCMapping *mappings, *m;
593     int32_t *map;
594     int32_t i, j, count;
595     int8_t flag;
596 
597     mappings=table->mappings;
598     map=table->reverseMap;
599     count=table->mappingsLength;
600 
601     /*
602      * we do not go through the map on input because the mappings are
603      * sorted lexically
604      */
605     m=mappings;
606 
607     for(i=j=0; i<count; ++m, ++i) {
608         flag=m->f;
609         if(flag>=0) {
610             flag&=MBCS_FROM_U_EXT_MASK;
611             m->f=flag;
612         }
613         if(flag==0 || flag==1 || (flag==2 && m->bLen==1) || flag==4) {
614             map[j++]=i;
615 
616             if(m->uLen>1) {
617                 /* recode all but the first code point to 16-bit Unicode */
618                 UChar32 *u32;
619                 UChar *u;
620                 UChar32 c;
621                 int32_t q, r;
622 
623                 u32=UCM_GET_CODE_POINTS(table, m);
624                 u=(UChar *)u32; /* destructive in-place recoding */
625                 for(r=2, q=1; q<m->uLen; ++q) {
626                     c=u32[q];
627                     U16_APPEND_UNSAFE(u, r, c);
628                 }
629 
630                 /* counts the first code point always at 2 - the first 16-bit unit is at 16-bit index 2 */
631                 m->uLen=(int8_t)r;
632             }
633         }
634     }
635 
636     return j;
637 }
638 
639 static uint32_t
getFromUBytesValue(CnvExtData * extData,UCMTable * table,UCMapping * m)640 getFromUBytesValue(CnvExtData *extData, UCMTable *table, UCMapping *m) {
641     uint8_t *bytes, *resultBytes;
642     uint32_t value;
643     int32_t u16Length, ratio;
644 
645     if(m->f==2) {
646         /*
647          * no mapping, <subchar1> preferred
648          *
649          * no need to count in statistics because the subchars are already
650          * counted for maxOutBytes and maxBytesPerUChar in UConverterStaticData,
651          * and this non-mapping does not count for maxInUChars which are always
652          * trivially at least two if counting unmappable supplementary code points
653          */
654         return UCNV_EXT_FROM_U_SUBCHAR1;
655     }
656 
657     bytes=UCM_GET_BYTES(table, m);
658     value=0;
659     switch(m->bLen) {
660         /* 1..3: store the bytes in the value word */
661     case 3:
662         value=((uint32_t)*bytes++)<<16;
663     case 2:
664         value|=((uint32_t)*bytes++)<<8;
665     case 1:
666         value|=*bytes;
667         break;
668     default:
669         /* the parser enforces m->bLen<=UCNV_EXT_MAX_BYTES */
670         /* store the bytes in fromUBytes[] and the index in the value word */
671         value=(uint32_t)utm_countItems(extData->fromUBytes);
672         resultBytes=utm_allocN(extData->fromUBytes, m->bLen);
673         uprv_memcpy(resultBytes, bytes, m->bLen);
674         break;
675     }
676     value|=(uint32_t)m->bLen<<UCNV_EXT_FROM_U_LENGTH_SHIFT;
677     if(m->f==0) {
678         value|=UCNV_EXT_FROM_U_ROUNDTRIP_FLAG;
679     } else if(m->f==4) {
680         value|=UCNV_EXT_FROM_U_GOOD_ONE_WAY_FLAG;
681     }
682 
683     /* calculate the real UTF-16 length (see recoding in prepareFromUMappings()) */
684     if(m->uLen==1) {
685         u16Length=U16_LENGTH(m->u);
686     } else {
687         u16Length=U16_LENGTH(UCM_GET_CODE_POINTS(table, m)[0])+(m->uLen-2);
688     }
689 
690     /* update statistics */
691     if(u16Length>extData->maxInUChars) {
692         extData->maxInUChars=u16Length;
693     }
694     if(m->bLen>extData->maxOutBytes) {
695         extData->maxOutBytes=m->bLen;
696     }
697 
698     ratio=(m->bLen+(u16Length-1))/u16Length;
699     if(ratio>extData->maxBytesPerUChar) {
700         extData->maxBytesPerUChar=ratio;
701     }
702 
703     return value;
704 }
705 
706 /*
707  * works like generateToUTable(), except that the
708  * output section consists of two arrays, one for input UChars and one
709  * for result values
710  *
711  * also, fromUTable sections are always stored in a compact form for
712  * access via binary search
713  */
714 static UBool
generateFromUTable(CnvExtData * extData,UCMTable * table,int32_t start,int32_t limit,int32_t unitIndex,uint32_t defaultValue)715 generateFromUTable(CnvExtData *extData, UCMTable *table,
716                    int32_t start, int32_t limit, int32_t unitIndex,
717                    uint32_t defaultValue) {
718     UCMapping *mappings, *m;
719     int32_t *map;
720     int32_t i, j, uniqueCount, count, subStart, subLimit;
721 
722     UChar *uchars;
723     UChar32 low, high, prev;
724 
725     UChar *sectionUChars;
726     uint32_t *sectionValues;
727 
728     mappings=table->mappings;
729     map=table->reverseMap;
730 
731     /* step 1: examine the input units; set low, high, uniqueCount */
732     m=mappings+map[start];
733     uchars=(UChar *)UCM_GET_CODE_POINTS(table, m);
734     low=uchars[unitIndex];
735     uniqueCount=1;
736 
737     prev=high=low;
738     for(i=start+1; i<limit; ++i) {
739         m=mappings+map[i];
740         uchars=(UChar *)UCM_GET_CODE_POINTS(table, m);
741         high=uchars[unitIndex];
742 
743         if(high!=prev) {
744             prev=high;
745             ++uniqueCount;
746         }
747     }
748 
749     /* step 2: allocate the section; set count, section */
750     /* the fromUTable always stores for access via binary search */
751     count=uniqueCount;
752 
753     /* allocate the section: 1 entry for the header + count for the items */
754     sectionUChars=(UChar *)utm_allocN(extData->fromUTableUChars, 1+count);
755     sectionValues=(uint32_t *)utm_allocN(extData->fromUTableValues, 1+count);
756 
757     /* write the section header */
758     *sectionUChars++=(UChar)count;
759     *sectionValues++=defaultValue;
760 
761     /* step 3: write temporary section table with subsection starts */
762     prev=low-1; /* just before low to prevent empty subsections before low */
763     j=0; /* section table index */
764     for(i=start; i<limit; ++i) {
765         m=mappings+map[i];
766         uchars=(UChar *)UCM_GET_CODE_POINTS(table, m);
767         high=uchars[unitIndex];
768 
769         if(high!=prev) {
770             /* start of a new subsection for unit high */
771             prev=high;
772 
773             /* write the entry with the subsection start */
774             sectionUChars[j]=(UChar)high;
775             sectionValues[j]=(uint32_t)i;
776             ++j;
777         }
778     }
779     /* assert(j==count) */
780 
781     /* step 4: recurse and write results */
782     subLimit=(int32_t)(sectionValues[0]);
783     for(j=0; j<count; ++j) {
784         subStart=subLimit;
785         subLimit= (j+1)<count ? (int32_t)(sectionValues[j+1]) : limit;
786 
787         /* see if there is exactly one input unit sequence of length unitIndex+1 */
788         defaultValue=0;
789         m=mappings+map[subStart];
790         if(m->uLen==unitIndex+1) {
791             /* do not include this in generateToUTable() */
792             ++subStart;
793 
794             if(subStart<subLimit && mappings[map[subStart]].uLen==unitIndex+1) {
795                 /* print error for multiple same-input-sequence mappings */
796                 fprintf(stderr, "error: multiple mappings from same Unicode code points\n");
797                 ucm_printMapping(table, m, stderr);
798                 ucm_printMapping(table, mappings+map[subStart], stderr);
799                 return FALSE;
800             }
801 
802             defaultValue=getFromUBytesValue(extData, table, m);
803         }
804 
805         if(subStart==subLimit) {
806             /* write the result for the input sequence ending here */
807             sectionValues[j]=defaultValue;
808         } else {
809             /* write the index to the subsection table */
810             sectionValues[j]=(uint32_t)utm_countItems(extData->fromUTableValues);
811 
812             /* recurse */
813             if(!generateFromUTable(extData, table, subStart, subLimit, unitIndex+1, defaultValue)) {
814                 return FALSE;
815             }
816         }
817     }
818     return TRUE;
819 }
820 
821 /*
822  * add entries to the fromUnicode trie,
823  * assume to be called with code points in ascending order
824  * and use that to build the trie in precompacted form
825  */
826 static void
addFromUTrieEntry(CnvExtData * extData,UChar32 c,uint32_t value)827 addFromUTrieEntry(CnvExtData *extData, UChar32 c, uint32_t value) {
828     int32_t i1, i2, i3, i3b, nextOffset, min, newBlock;
829 
830     if(value==0) {
831         return;
832     }
833 
834     /*
835      * compute the index for each stage,
836      * allocate a stage block if necessary,
837      * and write the stage value
838      */
839     i1=c>>10;
840     if(i1>=extData->stage1Top) {
841         extData->stage1Top=i1+1;
842     }
843 
844     nextOffset=(c>>4)&0x3f;
845 
846     if(extData->stage1[i1]==0) {
847         /* allocate another block in stage 2; overlap with the previous block */
848         newBlock=extData->stage2Top;
849         min=newBlock-nextOffset; /* minimum block start with overlap */
850         while(min<newBlock && extData->stage2[newBlock-1]==0) {
851             --newBlock;
852         }
853 
854         extData->stage1[i1]=(uint16_t)newBlock;
855         extData->stage2Top=newBlock+MBCS_STAGE_2_BLOCK_SIZE;
856         if(extData->stage2Top>UPRV_LENGTHOF(extData->stage2)) {
857             fprintf(stderr, "error: too many stage 2 entries at U+%04x\n", (int)c);
858             exit(U_MEMORY_ALLOCATION_ERROR);
859         }
860     }
861 
862     i2=extData->stage1[i1]+nextOffset;
863     nextOffset=c&0xf;
864 
865     if(extData->stage2[i2]==0) {
866         /* allocate another block in stage 3; overlap with the previous block */
867         newBlock=extData->stage3Top;
868         min=newBlock-nextOffset; /* minimum block start with overlap */
869         while(min<newBlock && extData->stage3[newBlock-1]==0) {
870             --newBlock;
871         }
872 
873         /* round up to a multiple of stage 3 granularity >1 (similar to utrie.c) */
874         newBlock=(newBlock+(UCNV_EXT_STAGE_3_GRANULARITY-1))&~(UCNV_EXT_STAGE_3_GRANULARITY-1);
875         extData->stage2[i2]=(uint16_t)(newBlock>>UCNV_EXT_STAGE_2_LEFT_SHIFT);
876 
877         extData->stage3Top=newBlock+MBCS_STAGE_3_BLOCK_SIZE;
878         if(extData->stage3Top>UPRV_LENGTHOF(extData->stage3)) {
879             fprintf(stderr, "error: too many stage 3 entries at U+%04x\n", (int)c);
880             exit(U_MEMORY_ALLOCATION_ERROR);
881         }
882     }
883 
884     i3=((int32_t)extData->stage2[i2]<<UCNV_EXT_STAGE_2_LEFT_SHIFT)+nextOffset;
885     /*
886      * assume extData->stage3[i3]==0 because we get
887      * code points in strictly ascending order
888      */
889 
890     if(value==UCNV_EXT_FROM_U_SUBCHAR1) {
891         /* <subchar1> SUB mapping, see getFromUBytesValue() and prepareFromUMappings() */
892         extData->stage3[i3]=1;
893 
894         /*
895          * precompaction is not optimal for <subchar1> |2 mappings because
896          * stage3 values for them are all the same, unlike for other mappings
897          * which all have unique values;
898          * use a simple compaction of reusing a whole block filled with these
899          * mappings
900          */
901 
902         /* is the entire block filled with <subchar1> |2 mappings? */
903         if(nextOffset==MBCS_STAGE_3_BLOCK_SIZE-1) {
904             for(min=i3-nextOffset;
905                 min<i3 && extData->stage3[min]==1;
906                 ++min) {}
907 
908             if(min==i3) {
909                 /* the entire block is filled with these mappings */
910                 if(extData->stage3Sub1Block!=0) {
911                     /* point to the previous such block and remove this block from stage3 */
912                     extData->stage2[i2]=extData->stage3Sub1Block;
913                     extData->stage3Top-=MBCS_STAGE_3_BLOCK_SIZE;
914                     uprv_memset(extData->stage3+extData->stage3Top, 0, MBCS_STAGE_3_BLOCK_SIZE*2);
915                 } else {
916                     /* remember this block's stage2 entry */
917                     extData->stage3Sub1Block=extData->stage2[i2];
918                 }
919             }
920         }
921     } else {
922         if((i3b=extData->stage3bTop++)>=UPRV_LENGTHOF(extData->stage3b)) {
923             fprintf(stderr, "error: too many stage 3b entries at U+%04x\n", (int)c);
924             exit(U_MEMORY_ALLOCATION_ERROR);
925         }
926 
927         /* roundtrip or fallback mapping */
928         extData->stage3[i3]=(uint16_t)i3b;
929         extData->stage3b[i3b]=value;
930     }
931 }
932 
933 static UBool
generateFromUTrie(CnvExtData * extData,UCMTable * table,int32_t mapLength)934 generateFromUTrie(CnvExtData *extData, UCMTable *table, int32_t mapLength) {
935     UCMapping *mappings, *m;
936     int32_t *map;
937     uint32_t value;
938     int32_t subStart, subLimit;
939 
940     UChar32 *codePoints;
941     UChar32 c, next;
942 
943     if(mapLength==0) {
944         return TRUE;
945     }
946 
947     mappings=table->mappings;
948     map=table->reverseMap;
949 
950     /*
951      * iterate over same-initial-code point mappings,
952      * enter the initial code point into the trie,
953      * and start a recursion on the corresponding mappings section
954      * with generateFromUTable()
955      */
956     m=mappings+map[0];
957     codePoints=UCM_GET_CODE_POINTS(table, m);
958     next=codePoints[0];
959     subLimit=0;
960     while(subLimit<mapLength) {
961         /* get a new subsection of mappings starting with the same code point */
962         subStart=subLimit;
963         c=next;
964         while(next==c && ++subLimit<mapLength) {
965             m=mappings+map[subLimit];
966             codePoints=UCM_GET_CODE_POINTS(table, m);
967             next=codePoints[0];
968         }
969 
970         /*
971          * compute the value for this code point;
972          * if there is a mapping for this code point alone, it is at subStart
973          * because the table is sorted lexically
974          */
975         value=0;
976         m=mappings+map[subStart];
977         codePoints=UCM_GET_CODE_POINTS(table, m);
978         if(m->uLen==1) {
979             /* do not include this in generateFromUTable() */
980             ++subStart;
981 
982             if(subStart<subLimit && mappings[map[subStart]].uLen==1) {
983                 /* print error for multiple same-input-sequence mappings */
984                 fprintf(stderr, "error: multiple mappings from same Unicode code points\n");
985                 ucm_printMapping(table, m, stderr);
986                 ucm_printMapping(table, mappings+map[subStart], stderr);
987                 return FALSE;
988             }
989 
990             value=getFromUBytesValue(extData, table, m);
991         }
992 
993         if(subStart==subLimit) {
994             /* write the result for this one code point */
995             addFromUTrieEntry(extData, c, value);
996         } else {
997             /* write the index to the subsection table */
998             addFromUTrieEntry(extData, c, (uint32_t)utm_countItems(extData->fromUTableValues));
999 
1000             /* recurse, starting from 16-bit-unit index 2, the first 16-bit unit after c */
1001             if(!generateFromUTable(extData, table, subStart, subLimit, 2, value)) {
1002                 return FALSE;
1003             }
1004         }
1005     }
1006     return TRUE;
1007 }
1008 
1009 /*
1010  * Generate the fromU data structures from the input table.
1011  * The input table must be sorted, and all precision flags must be 0..3.
1012  * This function will modify the table's reverseMap.
1013  */
1014 static UBool
makeFromUTable(CnvExtData * extData,UCMTable * table)1015 makeFromUTable(CnvExtData *extData, UCMTable *table) {
1016     uint16_t *stage1;
1017     int32_t i, stage1Top, fromUCount;
1018 
1019     fromUCount=prepareFromUMappings(table);
1020 
1021     extData->fromUTableUChars=utm_open("cnv extension fromUTableUChars", 0x10000, UCNV_EXT_FROM_U_DATA_MASK+1, 2);
1022     extData->fromUTableValues=utm_open("cnv extension fromUTableValues", 0x10000, UCNV_EXT_FROM_U_DATA_MASK+1, 4);
1023     extData->fromUBytes=utm_open("cnv extension fromUBytes", 0x10000, UCNV_EXT_FROM_U_DATA_MASK+1, 1);
1024 
1025     /* allocate all-unassigned stage blocks */
1026     extData->stage2Top=MBCS_STAGE_2_FIRST_ASSIGNED;
1027     extData->stage3Top=MBCS_STAGE_3_FIRST_ASSIGNED;
1028 
1029     /*
1030      * stage 3b stores only unique values, and in
1031      * index 0: 0 for "no mapping"
1032      * index 1: "no mapping" with preference for <subchar1> rather than <subchar>
1033      */
1034     extData->stage3b[1]=UCNV_EXT_FROM_U_SUBCHAR1;
1035     extData->stage3bTop=2;
1036 
1037     /* allocate the first entry in the fromUTable because index 0 means "no result" */
1038     utm_alloc(extData->fromUTableUChars);
1039     utm_alloc(extData->fromUTableValues);
1040 
1041     if(!generateFromUTrie(extData, table, fromUCount)) {
1042         return FALSE;
1043     }
1044 
1045     /*
1046      * offset the stage 1 trie entries by stage1Top because they will
1047      * be stored in a single array
1048      */
1049     stage1=extData->stage1;
1050     stage1Top=extData->stage1Top;
1051     for(i=0; i<stage1Top; ++i) {
1052         stage1[i]=(uint16_t)(stage1[i]+stage1Top);
1053     }
1054 
1055     return TRUE;
1056 }
1057 
1058 /* -------------------------------------------------------------------------- */
1059 
1060 static UBool
CnvExtAddTable(NewConverter * cnvData,UCMTable * table,UConverterStaticData * staticData)1061 CnvExtAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData) {
1062     CnvExtData *extData;
1063 
1064     if(table->unicodeMask&UCNV_HAS_SURROGATES) {
1065         fprintf(stderr, "error: contains mappings for surrogate code points\n");
1066         return FALSE;
1067     }
1068 
1069     staticData->conversionType=UCNV_MBCS;
1070 
1071     extData=(CnvExtData *)cnvData;
1072 
1073     /*
1074      * assume that the table is sorted
1075      *
1076      * call the functions in this order because
1077      * makeToUTable() modifies the original reverseMap,
1078      * makeFromUTable() writes a whole new mapping into reverseMap
1079      */
1080     return
1081         makeToUTable(extData, table) &&
1082         makeFromUTable(extData, table);
1083 }
1084