1 /******************************************************************************
2 * $Id: generate_encoding_table.c 355b41831cd2685c85d1aabe5b95665a2c6e99b7 2019-06-19 17:07:04 +0200 Even Rouault $
3 *
4 * Project: OGR
5 * Purpose: Generate a mapping table from a 1-byte encoding to unicode,
6 * for ogr_expat.cpp
7 * Author: Even Rouault, even dot rouault at spatialys.com
8 *
9 ******************************************************************************
10 * Copyright (c) 2012, Even Rouault <even dot rouault at spatialys.com>
11 *
12 * Permission is hereby granted, free of charge, to any person obtaining a
13 * copy of this software and associated documentation files (the "Software"),
14 * to deal in the Software without restriction, including without limitation
15 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
16 * and/or sell copies of the Software, and to permit persons to whom the
17 * Software is furnished to do so, subject to the following conditions:
18 *
19 * The above copyright notice and this permission notice shall be included
20 * in all copies or substantial portions of the Software.
21 *
22 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
23 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
25 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
27 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
28 * DEALINGS IN THE SOFTWARE.
29 ****************************************************************************/
30
31 #include <errno.h>
32 #include <iconv.h>
33 #include <stdio.h>
34 #include <stdlib.h>
35 #include <string.h>
36
/**
 * Decode the single UTF-8 sequence that starts at p.
 *
 * @param p    first byte of the sequence; this byte is always read.
 * @param end  one past the last readable byte; no byte at or beyond end is
 *             read, except p[0] itself.
 * @param len  receives the number of bytes consumed (1 to 4).
 * @return the decoded code point. On a malformed, overlong, truncated or
 *         out-of-range (> U+10FFFF) sequence, *len is set to 1 and the result
 *         is U+FFFD, or the offending byte itself when ERRORS_TO_ISO8859_1
 *         is set.
 *
 * Optional compile-time switches (all off unless defined to non-zero):
 *   ERRORS_TO_CP1252    map bytes 0x80..0x9f through a cp1252[] table
 *                       (NOTE(review): that table is not declared in the code
 *                       visible here - it must be provided elsewhere).
 *   ERRORS_TO_ISO8859_1 return the raw byte instead of U+FFFD on failure.
 *   STRICT_RFC3629      additionally reject surrogates and code points
 *                       ending in FFFE / FFFF.
 */
static unsigned utf8decode(const char* p, const char* end, int* len)
{
    const unsigned char *s = (const unsigned char *)p;
    const unsigned char c = s[0];

    /* Plain ASCII. */
    if (c < 0x80) {
        *len = 1;
        return c;
    }
#if ERRORS_TO_CP1252
    if (c < 0xa0) {
        *len = 1;
        return cp1252[c - 0x80];
    }
#endif
    /* 0x80..0xc1: stray continuation byte or overlong 2-byte lead.
     * 0xf5..0xff: would encode something above U+10FFFF. */
    if (c < 0xc2 || c > 0xf4)
        goto fail;

    /* Every remaining form needs at least one continuation byte.
     * The remaining length is tested as (end - p) so that no pointer
     * beyond the buffer is ever formed. */
    if (end - p < 2 || (s[1] & 0xc0) != 0x80)
        goto fail;

    /* Two-byte sequence: 0xc2..0xdf. */
    if (c < 0xe0) {
        *len = 2;
        return ((unsigned)(c & 0x1f) << 6) |
               (unsigned)(s[1] & 0x3f);
    }

    /* Three-byte sequence: 0xe0..0xef. */
    if (c < 0xf0) {
        /* 0xe0 followed by < 0xa0 is an overlong encoding. */
        if (c == 0xe0 && s[1] < 0xa0)
            goto fail;
#if STRICT_RFC3629
        /* RFC 3629 says surrogate chars are illegal. */
        if (c == 0xed && s[1] >= 0xa0)
            goto fail;
#endif
        if (end - p < 3 || (s[2] & 0xc0) != 0x80)
            goto fail;
#if STRICT_RFC3629
        /* 0xfffe and 0xffff are also illegal characters. s[2] is only
         * inspected here, after the length check above. */
        if (c == 0xef && s[1] == 0xbf && s[2] >= 0xbe)
            goto fail;
#endif
        *len = 3;
        return ((unsigned)(c & 0x0f) << 12) |
               ((unsigned)(s[1] & 0x3f) << 6) |
               (unsigned)(s[2] & 0x3f);
    }

    /* Four-byte sequence: 0xf0..0xf4. */
    if (c == 0xf0 && s[1] < 0x90)
        goto fail;                      /* overlong encoding */
    if (c == 0xf4 && s[1] > 0x8f)
        goto fail;                      /* after 0x10ffff */
    if (end - p < 4 || (s[2] & 0xc0) != 0x80 || (s[3] & 0xc0) != 0x80)
        goto fail;
#if STRICT_RFC3629
    /* RFC 3629 says all codes ending in fffe or ffff are illegal. */
    if ((s[1] & 0xf) == 0xf && s[2] == 0xbf && s[3] >= 0xbe)
        goto fail;
#endif
    *len = 4;
    return ((unsigned)(c & 0x07) << 18) |
           ((unsigned)(s[1] & 0x3f) << 12) |
           ((unsigned)(s[2] & 0x3f) << 6) |
           (unsigned)(s[3] & 0x3f);

fail:
    *len = 1;
#if ERRORS_TO_ISO8859_1
    return c;
#else
    return 0xfffd; /* Unicode REPLACEMENT CHARACTER */
#endif
}
110
/* Print the statement(s) assigning info->map[b] = b for every byte b in
 * [nFirst, nEnd): a single assignment for a run of one byte, a for loop
 * otherwise. */
static void emit_identity_run(int nFirst, int nEnd)
{
    if( nFirst + 1 == nEnd )
        printf("info->map[0x%02X] = 0x%02X;\n", nFirst, nFirst);
    else
    {
        printf("for(i = 0x%02X; i < 0x%02X; i++)\n", nFirst, nEnd);
        printf(" info->map[i] = i;\n");
    }
}

/**
 * Convert each of the 256 possible byte values from the encoding named in
 * argv[1] to UTF-8 with iconv, decode the resulting code point, and print C
 * statements filling an info->map[] table on stdout. Consecutive bytes that
 * map to themselves are collapsed into one assignment or one for loop; bytes
 * that iconv rejects (EILSEQ), or that decode to U+FFFD, are emitted as -1.
 *
 * @return 0 on success; 1 on a usage error, if the converter cannot be
 *         opened, or if iconv fails with anything other than EILSEQ.
 */
int main(int argc, char* argv[])
{
    iconv_t sConv;
    const char* pszSrcEncoding;
    const char* pszDstEncoding = "UTF-8";
    int i;
    int nLastIdentical = -1;    /* first byte of the pending identity run */

    if( argc != 2 )
    {
        fprintf(stderr, "Usage: generate_encoding_table encoding_name\n");
        return 1;
    }

    pszSrcEncoding = argv[1];

    sConv = iconv_open( pszDstEncoding, pszSrcEncoding );
    if( sConv == (iconv_t)-1 )
    {
        fprintf(stderr,
                "Recode from %s to %s failed with the error: \"%s\".",
                pszSrcEncoding, pszDstEncoding, strerror(errno) );
        return 1;
    }

    for( i = 0; i < 256; i++ )
    {
        char szSrcBuf[2] = {(char)i, 0};
        char szDstBuf[5] = {0, 0, 0, 0, 0};
        char *pszSrcBuf = szSrcBuf;
        char *pszDstBuf = szDstBuf;
        /* Always exactly one input byte: strlen() would report 0 for the
         * byte 0x00, so that byte would never reach iconv. */
        size_t nSrcLen = 1;
        size_t nDstLen = sizeof(szDstBuf);
        int nUnicode = -1;

        const size_t nConverted =
            iconv( sConv, &pszSrcBuf, &nSrcLen, &pszDstBuf, &nDstLen );

        if( nConverted == (size_t)-1 )
        {
            /* Save errno before any other library call can overwrite it. */
            const int nErr = errno;

            if( nErr == EILSEQ )
            {
                /* Byte has no mapping: leave nUnicode at -1 and put the
                 * descriptor back into its initial state for the next byte. */
                iconv( sConv, NULL, NULL, NULL, NULL );
            }
            else
            {
                if( nErr == E2BIG )
                    fprintf(stderr, "E2BIG for %d\n", i);
                else
                    fprintf(stderr, "other error for %d\n", i);
                iconv_close( sConv );
                return 1;
            }
        }
        else if( pszDstBuf > szDstBuf )
        {
            /* iconv advanced pszDstBuf past the bytes it produced; use that
             * as the end instead of strlen(), which cannot see a NUL output
             * byte and would overrun a completely filled buffer. */
            int len;
            nUnicode = (int)utf8decode(szDstBuf, pszDstBuf, &len);
            if( nUnicode == 0xfffd )
                nUnicode = -1;
        }
        /* else: nothing was written, so the byte is treated as unmapped. */

        /* A non-identity byte terminates the pending identity run. */
        if( nLastIdentical >= 0 && i != nUnicode )
        {
            emit_identity_run(nLastIdentical, i);
            nLastIdentical = -1;
        }

        if( nUnicode < 0 )
            printf("info->map[0x%02X] = -1;\n", i);
        else if( nUnicode <= 0xFF )
        {
            if( i == nUnicode )
            {
                /* Start, or silently extend, an identity run. */
                if( nLastIdentical < 0 )
                    nLastIdentical = i;
            }
            else
                printf("info->map[0x%02X] = 0x%02X;\n", i, nUnicode);
        }
        else if( nUnicode <= 0xFFFF )
            printf("info->map[0x%02X] = 0x%04X;\n", i, nUnicode);
        else if( nUnicode <= 0xFFFFFF )
            printf("info->map[0x%02X] = 0x%06X;\n", i, nUnicode);
        else
            printf("info->map[0x%02X] = 0x%08X;\n", i, nUnicode);
    }

    /* Flush an identity run that reaches the end of the table (i == 256). */
    if( nLastIdentical >= 0 )
        emit_identity_run(nLastIdentical, i);

    iconv_close( sConv );

    return 0;
}
222