1 /******************************************************************************
2  * $Id: generate_encoding_table.c 355b41831cd2685c85d1aabe5b95665a2c6e99b7 2019-06-19 17:07:04 +0200 Even Rouault $
3  *
4  * Project:  OGR
5  * Purpose:  Generate a mapping table from a 1-byte encoding to unicode,
6  *           for ogr_expat.cpp
7  * Author:   Even Rouault, even dot rouault at spatialys.com
8  *
9  ******************************************************************************
10  * Copyright (c) 2012, Even Rouault <even dot rouault at spatialys.com>
11  *
12  * Permission is hereby granted, free of charge, to any person obtaining a
13  * copy of this software and associated documentation files (the "Software"),
14  * to deal in the Software without restriction, including without limitation
15  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
16  * and/or sell copies of the Software, and to permit persons to whom the
17  * Software is furnished to do so, subject to the following conditions:
18  *
19  * The above copyright notice and this permission notice shall be included
20  * in all copies or substantial portions of the Software.
21  *
22  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
23  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
25  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
27  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
28  * DEALINGS IN THE SOFTWARE.
29  ****************************************************************************/
30 
31 #include <errno.h>
32 #include <iconv.h>
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36 
/**
 * Decode the UTF-8 sequence starting at p into a single code point.
 *
 * @param p    First byte of the sequence. It is read unconditionally, so it
 *             must be dereferenceable even when p == end.
 * @param end  One past the last byte that may be examined; continuation
 *             bytes at or beyond it are never read.
 * @param len  Receives the number of bytes consumed: 1 to 4 on success,
 *             always 1 on failure.
 * @return The decoded code point. A stray continuation byte, an overlong
 *         form, a value above 0x10FFFF or a truncated sequence yields
 *         0xFFFD (or the raw lead byte when ERRORS_TO_ISO8859_1 is set).
 *
 * Compile-time switches (all off unless defined by the build):
 *  - ERRORS_TO_CP1252: bytes 0x80..0x9F are looked up in a cp1252[] table,
 *    which is not defined in the visible part of this file.
 *  - STRICT_RFC3629: additionally rejects surrogates (U+D800..U+DFFF) and
 *    code points ending in FFFE/FFFF. Without it those are decoded as-is.
 *  - ERRORS_TO_ISO8859_1: failures return the lead byte instead of 0xFFFD.
 */
static unsigned utf8decode(const char* p, const char* end, int* len)
{
  /* Work on unsigned bytes so comparisons do not depend on char signedness. */
  const unsigned char* s = (const unsigned char*)p;
  const unsigned char c = s[0];

  if (c < 0x80) {
    *len = 1;
    return c;
  }
#if ERRORS_TO_CP1252
  if (c < 0xa0) {
    *len = 1;
    return cp1252[c-0x80];
  }
#endif
  /* 0x80..0xc1: stray continuation byte or overlong 2-byte lead.
     0xf5..0xff: would encode a value beyond 0x10ffff. */
  if (c < 0xc2 || c > 0xf4) goto FAIL;

  /* Every remaining lead byte needs at least one continuation byte. */
  if (p+1 >= end || (s[1]&0xc0) != 0x80) goto FAIL;

  if (c < 0xe0) {
    *len = 2;
    return
      ((unsigned)(c & 0x1f) << 6) |
      ((unsigned)(s[1] & 0x3f));
  }

  if (c < 0xf0) {
    /* 0xe0 followed by less than 0xa0 is an overlong 3-byte form. */
    if (c == 0xe0 && s[1] < 0xa0) goto FAIL;
#if STRICT_RFC3629
    // RFC 3629 says surrogate chars are illegal.
    if (c == 0xed && s[1] >= 0xa0) goto FAIL;
#endif
    if (p+2 >= end || (s[2]&0xc0) != 0x80) goto FAIL;
#if STRICT_RFC3629
    // 0xfffe and 0xffff are also illegal characters.
    // Checked only after the bounds test above so s[2] is never read
    // past end.
    if (c == 0xef && s[1] == 0xbf && s[2] >= 0xbe) goto FAIL;
#endif
    *len = 3;
    return
      ((unsigned)(c & 0x0f) << 12) |
      ((unsigned)(s[1] & 0x3f) << 6) |
      ((unsigned)(s[2] & 0x3f));
  }

  /* Lead byte is 0xf0..0xf4 here: 4-byte sequence. */
  if (c == 0xf0 && s[1] < 0x90) goto FAIL; /* overlong 4-byte form */
  if (c == 0xf4 && s[1] > 0x8f) goto FAIL; /* after 0x10ffff */
  if (p+3 >= end || (s[2]&0xc0) != 0x80 || (s[3]&0xc0) != 0x80) goto FAIL;
#if STRICT_RFC3629
  // RFC 3629 says all codes ending in fffe or ffff are illegal:
  if ((s[1]&0xf) == 0xf && s[2] == 0xbf && s[3] >= 0xbe) goto FAIL;
#endif
  *len = 4;
  return
    ((unsigned)(c & 0x07) << 18) |
    ((unsigned)(s[1] & 0x3f) << 12) |
    ((unsigned)(s[2] & 0x3f) << 6) |
    ((unsigned)(s[3] & 0x3f));

FAIL:
  *len = 1;
#if ERRORS_TO_ISO8859_1
  return c;
#else
  return 0xfffd; // Unicode REPLACEMENT CHARACTER
#endif
}
110 
/**
 * Print the pending run of identity mappings [nFirst, nEnd) as generated
 * code: one assignment for a run of length one, a for loop otherwise.
 */
static void EmitIdentityRun(int nFirst, int nEnd)
{
    if( nFirst + 1 == nEnd )
        printf("info->map[0x%02X] = 0x%02X;\n",
               (unsigned)nFirst, (unsigned)nFirst);
    else
    {
        printf("for(i = 0x%02X; i < 0x%02X; i++)\n",
               (unsigned)nFirst, (unsigned)nEnd);
        printf("    info->map[i] = i;\n");
    }
}

/**
 * Entry point: generate_encoding_table encoding_name
 *
 * Converts each byte value 0..255 from the given encoding to UTF-8 with
 * iconv, decodes the result to a code point, and prints C statements on
 * stdout that fill info->map[]. Consecutive bytes that map to themselves
 * are merged into a single "for" loop; bytes iconv rejects with EILSEQ (or
 * that decode to U+FFFD) are emitted as -1.
 *
 * @return 0 on success, 1 on a usage error, if the converter cannot be
 *         opened, or if iconv fails with anything other than EILSEQ.
 */
int main(int argc, char* argv[])
{
    iconv_t sConv;
    const char* pszSrcEncoding;
    const char* pszDstEncoding = "UTF-8";
    int i;
    /* First byte of the identity run currently being accumulated, or -1. */
    int nLastIdentical = -1;

    if( argc != 2 )
    {
        fprintf(stderr, "Usage: generate_encoding_table encoding_name\n");
        return 1;
    }

    pszSrcEncoding = argv[1];

    sConv = iconv_open( pszDstEncoding, pszSrcEncoding );

    if ( sConv == (iconv_t)-1 )
    {
        fprintf(stderr,
                  "Recode from %s to %s failed with the error: \"%s\".\n",
                  pszSrcEncoding, pszDstEncoding, strerror(errno) );
        return 1;
    }

    for(i = 0; i < 256; i++)
    {
        char szSrcBuf[1] = {(char)i};
        char szDstBuf[5] = {0,0,0,0,0};
        char *pszSrcBuf = szSrcBuf;
        char *pszDstBuf = szDstBuf;
        /* Always exactly one input byte. Measuring with strlen() would give
           0 for i == 0 and the NUL byte would never be converted. */
        size_t  nSrcLen = sizeof(szSrcBuf);
        size_t  nDstLen = sizeof(szDstBuf);
        size_t  nConverted =
            iconv( sConv, &pszSrcBuf, &nSrcLen, &pszDstBuf, &nDstLen );

        int nUnicode = -1;
        if( nConverted == (size_t)-1 )
        {
            const int nErr = errno;
            if ( nErr == EILSEQ )
            {
                /* Byte has no mapping in the source encoding: it is
                   reported as -1 below. */
            }
            else
            {
                if ( nErr == E2BIG )
                    fprintf(stderr, "E2BIG for %d\n", i);
                else
                    fprintf(stderr, "other error for %d\n", i);
                iconv_close( sConv );
                return 1;
            }
        }
        else
        {
            int len;
            /* iconv advanced pszDstBuf to the end of what it wrote. Using
               that instead of strlen() avoids reading past szDstBuf when
               all five bytes were filled. */
            /* NOTE(review): only the first code point of the output is
               decoded; anything after it is ignored. */
            nUnicode = (int)utf8decode(szDstBuf, pszDstBuf, &len);
            if( nUnicode == 0xfffd )
                nUnicode = -1;
        }

        /* The identity run ends as soon as a byte does not map to itself. */
        if( nLastIdentical >= 0 && i != nUnicode )
        {
            EmitIdentityRun(nLastIdentical, i);
            nLastIdentical = -1;
        }

        if( nUnicode < 0 )
            printf("info->map[0x%02X] = -1;\n", (unsigned)i);
        else if (nUnicode <= 0xFF )
        {
            if( i == nUnicode )
            {
                /* Start (or continue) an identity run; printed later. */
                if( nLastIdentical < 0 )
                    nLastIdentical = i;
            }
            else
                printf("info->map[0x%02X] = 0x%02X;\n",
                       (unsigned)i, (unsigned)nUnicode);
        }
        else if (nUnicode <= 0xFFFF )
            printf("info->map[0x%02X] = 0x%04X;\n",
                   (unsigned)i, (unsigned)nUnicode);
        else if (nUnicode <= 0xFFFFFF )
            printf("info->map[0x%02X] = 0x%06X;\n",
                   (unsigned)i, (unsigned)nUnicode);
        else
            printf("info->map[0x%02X] = 0x%08X;\n",
                   (unsigned)i, (unsigned)nUnicode);
    }

    /* Flush a run that extends through the last byte (i is 256 here). */
    if( nLastIdentical >= 0 )
        EmitIdentityRun(nLastIdentical, i);

    iconv_close( sConv );

    return 0;
}
222