1 /*
2 * Copyright (c) 2000-2010, Yandex
3 *
4 * This file is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU Lesser Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This file is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU Lesser Public License for more details.
13 * You should have received a copy of the GNU Lesser Public License
14 * along with Pire. If not, see <http://www.gnu.org/licenses>.
15 */
16
17
18 #ifndef PIRE_STUB_CODEPAGE_H_H
19 #define PIRE_STUB_CODEPAGE_H_H
20
21 struct CodePage;
22 struct Recoder;
23 struct Encoder;
24
25 /*****************************************************************\
26 * struct CodePage *
27 \*****************************************************************/
28 struct CodePage {
29 docCodes CPEnum; // int MIBEnum;
30 const char *Names[30]; // name[0] -- preferred mime-name
31 wchar32 unicode[256];
32 const char *DefaultChar; //[CCL_NUM]
33
is_lowerCodePage34 bool is_lower(int ch) const {return ch>=0 && Pire::is_lower(unicode[(unsigned char)ch]);}
is_upperCodePage35 bool is_upper(int ch) const {return ch>=0 && Pire::is_upper(unicode[(unsigned char)ch]);}
is_alphaCodePage36 bool is_alpha(int ch) const {return ch>=0 && Pire::is_alpha(unicode[(unsigned char)ch]);}
is_digitCodePage37 bool is_digit(int ch) const {return ch>=0 && Pire::is_digit(unicode[(unsigned char)ch]);}
is_xdigitCodePage38 bool is_xdigit(int ch)const {return ch>=0 && Pire::is_xdigit(unicode[(unsigned char)ch]);}
is_alnumCodePage39 bool is_alnum(int ch) const {return ch>=0 && Pire::is_alnum(unicode[(unsigned char)ch]);}
is_spaceCodePage40 bool is_space(int ch) const {return ch>=0 && Pire::is_space(unicode[(unsigned char)ch]);}
is_punctCodePage41 bool is_punct(int ch) const {return ch>=0 && Pire::is_punct(unicode[(unsigned char)ch]);}
is_cntrlCodePage42 bool is_cntrl(int ch) const {return ch>=0 && Pire::is_cntrl(unicode[(unsigned char)ch]);}
is_graphCodePage43 bool is_graph(int ch) const {return ch>=0 && Pire::is_graph(unicode[(unsigned char)ch]);}
is_printCodePage44 bool is_print(int ch) const {return ch>=0 && Pire::is_print(unicode[(unsigned char)ch]);}
45 // non-standard
is_composedCodePage46 bool is_composed(int ch) const {return ch>=0 && Pire::is_composed(unicode[(unsigned char)ch]);}
47
48 char *strlwr(char *in_out, size_t len = (unsigned)(-1)) const;
49 char *strupr(char *in_out, size_t len = (unsigned)(-1)) const;
50 char *strlwr(const char *in, char *out, size_t len = (unsigned)(-1)) const;
51 char *strupr(const char *in, char *out, size_t len = (unsigned)(-1)) const;
52 int stricmp(const char* s1, const char *s2) const;
53 int strnicmp(const char* s1, const char *s2, size_t len) const;
54
55 unsigned char to_upper(unsigned char ch) const;
56 unsigned char to_lower(unsigned char ch) const;
57 unsigned char to_title(unsigned char ch) const;
58 int to_digit(unsigned char ch) const;
59
60 static void Initialize();
61 };
62
63 const CodePage *CodePageByName(const char *name);
64
65 namespace NCodepagePrivate {
66 class TCodepagesMap {
67 private:
68 const CodePage* Data[CODES_MAX];
69
70 public:
71 TCodepagesMap();
72
Get(docCodes e)73 const CodePage* Get(docCodes e) {
74 Y_ASSERT(CODES_UNKNOWN < e && e < CODES_MAX);
75 return Data[e];
76 }
77 };
78 }
79
CodePageByDocCode(docCodes e)80 const CodePage *CodePageByDocCode(docCodes e)
81 {
82 return Singleton<NCodepagePrivate::TCodepagesMap>()->Get(e);
83 }
84
DocCodeByName(const char * name)85 docCodes DocCodeByName(const char *name)
86 {
87 const CodePage *CP = CodePageByName(name);
88 if (CP == 0)
89 return CODES_UNKNOWN;
90 return CP->CPEnum;
91 }
DocCodeByCodePage(const CodePage * CP)92 docCodes DocCodeByCodePage(const CodePage *CP)
93 {
94 return CP->CPEnum;
95 }
NameByDocCode(docCodes e)96 const char *NameByDocCode(docCodes e)
97 {
98 return CodePageByDocCode(e)->Names[0];
99 }
NameByCodePage(const CodePage * CP)100 const char *NameByCodePage(const CodePage *CP)
101 {
102 return CP->Names[0];
103 }
104
105 docCodes EncodingHintByName(const char* name);
106
107 /*****************************************************************\
108 * struct Encoder *
109 \*****************************************************************/
/// Status codes returned by the recoding helpers in this header.
enum RECODE_RESULT {
    RECODE_OK            = 0,  // conversion succeeded
    RECODE_EOINPUT       = 1,  // input ended before a complete symbol
    RECODE_EOOUTPUT      = 2,  // output buffer is too small
    RECODE_BROKENSYMBOL  = 3,  // malformed input sequence
    RECODE_ERROR         = 4,
    RECODE_DEFAULTSYMBOL = 5
};
118
119 struct Encoder {
120 char *Table[256];
121 const char *DefaultChar;
122
CodeEncoder123 char Code(wchar32 ch) const
124 {
125 if (ch > 0xFFFF)
126 return 0;
127 return (unsigned char)Table[(ch>>8)&255][ch&255];
128 }
TrEncoder129 char Tr(wchar32 ch) const
130 {
131 char code = Code(ch);
132 if (code == 0 && ch != 0)
133 code = DefaultChar[wc_type(ch)];
134 Y_ASSERT(code != 0 || ch == 0);
135 return code;
136 }
137
138 unsigned char operator [](wchar32 ch) const
139 {
140 return Tr(ch);
141 }
142 void Tr(const wchar32 *in, char *out, size_t len) const;
143 void Tr(const wchar32 *in, char *out) const;
144 char * DefaultPlane;
145 };
146
147 struct CustomEncoder : public Encoder {
148 void Create (const CodePage *target, int mode=1);
149 void Free ();
150 private:
151 void addToTable(wchar32 ucode, unsigned char code, const CodePage* target);
152 };
153
154 struct MultipleEncMapping {
155 typedef ui32 maptype;
156 static maptype DefaultPlane[256];
157 maptype *Table[256];
158 MultipleEncMapping();
GetEncodingsMultipleEncMapping159 maptype GetEncodings(wchar32 ch) const {
160 return Table[(ch>>8)&255][ch&255];
161 }
162 void ImportEncoder(const Encoder &E, int enc);
163 ~MultipleEncMapping();
164
165 DECLARE_NOCOPY(MultipleEncMapping);
166 };
167
168 /*****************************************************************\
169 * struct Recoder *
170 \*****************************************************************/
171 struct Recoder {
172 unsigned char Table[257];
173
174 void Create(const CodePage &source, const CodePage &target);
175 void Create(const CodePage &source, const Encoder* wideTarget);
176
177 void Create(const CodePage &page, wchar32 (*mapper)(wchar32));
178 void Create(const CodePage &page, const Encoder* widePage, wchar32 (*mapper)(wchar32));
179
TrRecoder180 unsigned char Tr(unsigned char c) const
181 {
182 return Table[c];
183 }
184 unsigned char operator [](unsigned char c) const
185 {
186 return Table[c];
187 }
188 void Tr(const char *in, char *out, size_t len) const;
189 void Tr(const char *in, char *out) const;
190 void Tr(char *in_out, size_t len) const;
191 void Tr(char *in_out) const;
192 };
193
194 extern const Encoder *EncodeTo[CODES_MAX];
195
196 extern const struct Encoder &WideCharToYandex;
197
198 extern const Recoder rcdr_to_yandex[];
199 extern const Recoder rcdr_from_yandex[];
200
201 extern const Recoder rcdr_to_lower[];
202 extern const Recoder rcdr_to_upper[];
203 extern const Recoder rcdr_to_title[];
204
to_upper(unsigned char ch)205 unsigned char CodePage::to_upper(unsigned char ch) const
206 {
207 return rcdr_to_upper[CPEnum].Table[ch];
208 }
to_lower(unsigned char ch)209 unsigned char CodePage::to_lower(unsigned char ch) const
210 {
211 return rcdr_to_lower[CPEnum].Table[ch];
212 }
to_title(unsigned char ch)213 unsigned char CodePage::to_title(unsigned char ch) const
214 {
215 return rcdr_to_title[CPEnum].Table[ch];
216 }
to_digit(unsigned char ch)217 int CodePage::to_digit(unsigned char ch) const
218 {
219 return Pire::to_digit(unicode[ch]);
220 }
221
222 extern const struct CodePage csYandex;
223
// Named byte values (presumably positions in the csYandex code page --
// the meanings are inferred from the names; confirm against the tables).
const unsigned char yaUNK_Up = 0xA6;
const unsigned char yaUNK_Lo = 0xB6;
const unsigned char yaIDEOGR = 0x9F;
const unsigned char yaSHY    = 0x8F;
const unsigned char yaACUTE  = 0x80;
const unsigned char yaGradus = 0xB0;
230
/// Returns the mask that extracts the payload bits from the lead byte of a
/// UTF-8 sequence of [len] bytes: 0x1F for 2, 0x0F for 3, 0x07 for 4.
/// Lengths 0 and 1 yield 0, and so does any length above 4, so the lookup
/// never reads outside the table.
/// Declared inline so the header can be included from several translation units.
inline unsigned char utf8_leadbyte_mask(size_t len) {
    static const unsigned char masks[5] = { 0x00, 0x00, 0x1F, 0x0F, 0x07 };
    if (len >= sizeof(masks) / sizeof(masks[0]))
        return 0;
    return masks[len];
}
235
/// Returns the length in bytes of the UTF-8 sequence introduced by lead byte
/// [p], or 0 when [p] cannot start a sequence (continuation bytes 0x80..0xBF
/// and bytes 0xF8..0xFF). The table is indexed by the top five bits of [p].
/// Declared inline so the header can be included from several translation units.
inline size_t utf8_rune_len(const unsigned char p)
{
    static const unsigned char lengths[32] = {
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x00..0x7F
        0, 0, 0, 0, 0, 0, 0, 0,                          // 0x80..0xBF
        2, 2, 2, 2,                                      // 0xC0..0xDF
        3, 3,                                            // 0xE0..0xEF
        4,                                               // 0xF0..0xF7
        0                                                // 0xF8..0xFF
    };
    return lengths[p >> 3];
}
240
utf8_rune_len_by_ucs(wchar32 rune)241 size_t utf8_rune_len_by_ucs(wchar32 rune)
242 {
243 if (rune < 0x80)
244 return 1U;
245 else if (rune < 0x800)
246 return 2U;
247 else if (rune < 0x10000)
248 return 3U;
249 else if (rune < 0x200000)
250 return 4U;
251 else if (rune < 0x4000000)
252 return 5U;
253 else
254 return 6U;
255 }
256
257 extern const wchar32 BROKEN_RUNE;
258
utf8_read_rune(wchar32 & rune,size_t & rune_len,const unsigned char * s,const unsigned char * end)259 RECODE_RESULT utf8_read_rune(wchar32 &rune, size_t &rune_len, const unsigned char *s, const unsigned char *end)
260 {
261 rune = BROKEN_RUNE;
262 rune_len = 0;
263 wchar32 _rune;
264
265 size_t _len = utf8_rune_len(*s);
266 if (s + _len > end) return RECODE_EOINPUT; //[EOINPUT]
267 if (_len==0) return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
268 _rune = *s++; //[00000000 0XXXXXXX]
269
270 if (_len > 1) {
271 _rune &= utf8_leadbyte_mask(_len);
272 unsigned char ch = *s++;
273 if ((ch & 0xC0) != 0x80)
274 return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in second byte
275 _rune <<= 6;
276 _rune |= ch & 0x3F; //[00000XXX XXYYYYYY]
277 if (_len > 2) {
278 ch = *s++;
279 if ((ch & 0xC0) != 0x80)
280 return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in third byte
281 _rune <<= 6;
282 _rune |= ch & 0x3F; //[XXXXYYYY YYZZZZZZ]
283 if (_len > 3) {
284 ch = *s;
285 if ((ch & 0xC0) != 0x80)
286 return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in fourth byte
287 _rune <<= 6;
288 _rune |= ch & 0x3F; //[XXXYY YYYYZZZZ ZZQQQQQQ]
289 }
290 }
291 }
292 rune_len = _len;
293 rune = _rune;
294 return RECODE_OK;
295 }
296
utf8_put_rune(wchar32 rune,size_t & rune_len,unsigned char * s,const unsigned char * end)297 RECODE_RESULT utf8_put_rune(wchar32 rune, size_t &rune_len, unsigned char *s, const unsigned char *end){
298 rune_len = 0;
299 size_t tail = end - s;
300 if (rune < 0x80){
301 if (tail <= 0) return RECODE_EOOUTPUT;
302 *s = (unsigned char)rune;
303 rune_len = 1;
304 return RECODE_OK;
305 }
306 if (rune < 0x800){
307 if (tail <= 1) return RECODE_EOOUTPUT;
308 *s++ = (unsigned char)(0xC0 | (rune >> 6));
309 *s = (unsigned char)(0x80 | (rune & 0x3F));
310 rune_len = 2;
311 return RECODE_OK;
312 }
313 if (rune < 0x10000) {
314 if (tail <= 2) return RECODE_EOOUTPUT;
315 *s++ = (unsigned char)(0xE0 | (rune >> 12));
316 *s++ = (unsigned char)(0x80 | ((rune >> 6) & 0x3F));
317 *s = (unsigned char)(0x80 | (rune & 0x3F));
318 rune_len = 3;
319 return RECODE_OK;
320 }
321 /*if (rune < 0x200000)*/ {
322 if (tail <= 3) return RECODE_EOOUTPUT;
323 *s++ = (unsigned char)(0xF0 | ((rune >> 18) & 0x07));
324 *s++ = (unsigned char)(0x80 | ((rune >> 12) & 0x3F));
325 *s++ = (unsigned char)(0x80 | ((rune >> 6) & 0x3F));
326 *s = (unsigned char)(0x80 | (rune & 0x3F));
327 rune_len = 4;
328 return RECODE_OK;
329 }
330 };
331
utf8_read_rune_from_unknown_plane(wchar32 & rune,size_t & rune_len,const wchar32 * s,const wchar32 * end)332 RECODE_RESULT utf8_read_rune_from_unknown_plane(wchar32 &rune, size_t &rune_len, const wchar32 *s, const wchar32 *end) {
333 if ((*s & 0xFF00) != 0xF000) {
334 rune_len = 1;
335 rune = *s;
336 return RECODE_OK;
337 }
338
339 rune_len = 0;
340
341 size_t _len = utf8_rune_len((unsigned char)(*s));
342 if (s + _len > end) return RECODE_EOINPUT; //[EOINPUT]
343 if (_len == 0) return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in first byte
344
345 wchar32 _rune = (ui8)(*s++); //[00000000 0XXXXXXX]
346 if (_len > 1) {
347 _rune &= utf8_leadbyte_mask(_len);
348 wchar32 ch = *s++;
349 if ((ch & 0xFFC0) != 0xF080)
350 return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in second byte
351 _rune <<= 6;
352 _rune |= ch & 0x3F; //[00000XXX XXYYYYYY]
353 if (_len > 2) {
354 ch = *s++;
355 if ((ch & 0xFFC0) != 0xF080)
356 return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in third byte
357 _rune <<= 6;
358 _rune |= ch & 0x3F; //[XXXXYYYY YYZZZZZZ]
359 if (_len > 3) {
360 ch = *s;
361 if ((ch & 0xFFC0) != 0xF080)
362 return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in fourth byte
363 _rune <<= 6;
364 _rune |= ch & 0x3F; //[XXXYY YYYYZZZZ ZZQQQQQQ]
365 }
366 }
367 }
368 rune_len = _len;
369 rune = _rune;
370 return RECODE_OK;
371 }
372
373 /// this function changes (lowers) [end] position in case of utf-8
374 /// null character is NOT assumed or written at [*end]
375 void DecodeUnknownPlane(wchar32 *start, wchar32 *&end, const docCodes enc4unk);
376
377 /// this function may return less than [len] bytes in case of utf-8
378 /// [dst] buffer must have at least [len] bytes
379 /// [dst] is NOT terminated with null character
380 size_t DecodeUnknownAndRecodeToYandex(const wchar32 *src, char *dst, size_t len, const docCodes enc4unk);
381
382 void ToLower(char* s, size_t n, const CodePage& cp = csYandex) {
383 char* const e = s + n;
384 for (; s != e; ++s)
385 *s = cp.to_lower(*s);
386 }
387
388 void ToUpper(char* s, size_t n, const CodePage& cp = csYandex) {
389 char* const e = s + n;
390 for (; s != e; ++s)
391 *s = cp.to_upper(*s);
392 }
393
394 #endif
395