1 /*
2  * Copyright (c) 2000-2010, Yandex
3  *
4  * This file is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU Lesser Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This file is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU Lesser Public License for more details.
13  * You should have received a copy of the GNU Lesser Public License
14  * along with Pire.  If not, see <http://www.gnu.org/licenses>.
15  */
16 
17 
18 #ifndef PIRE_STUB_CODEPAGE_H_H
19 #define PIRE_STUB_CODEPAGE_H_H
20 
21 struct CodePage;
22 struct Recoder;
23 struct Encoder;
24 
25 /*****************************************************************\
26  *                    struct CodePage                              *
27 \*****************************************************************/
28 struct CodePage {
29 	docCodes    CPEnum;       // int MIBEnum;
30 	const char *Names[30];    // name[0] -- preferred mime-name
31 	wchar32     unicode[256];
32 	const char *DefaultChar;  //[CCL_NUM]
33 
is_lowerCodePage34 	bool is_lower(int ch) const {return ch>=0 && Pire::is_lower(unicode[(unsigned char)ch]);}
is_upperCodePage35 	bool is_upper(int ch) const {return ch>=0 && Pire::is_upper(unicode[(unsigned char)ch]);}
is_alphaCodePage36 	bool is_alpha(int ch) const {return ch>=0 && Pire::is_alpha(unicode[(unsigned char)ch]);}
is_digitCodePage37 	bool is_digit(int ch) const {return ch>=0 && Pire::is_digit(unicode[(unsigned char)ch]);}
is_xdigitCodePage38 	bool is_xdigit(int ch)const {return ch>=0 && Pire::is_xdigit(unicode[(unsigned char)ch]);}
is_alnumCodePage39 	bool is_alnum(int ch) const {return ch>=0 && Pire::is_alnum(unicode[(unsigned char)ch]);}
is_spaceCodePage40 	bool is_space(int ch) const {return ch>=0 && Pire::is_space(unicode[(unsigned char)ch]);}
is_punctCodePage41 	bool is_punct(int ch) const {return ch>=0 && Pire::is_punct(unicode[(unsigned char)ch]);}
is_cntrlCodePage42 	bool is_cntrl(int ch) const {return ch>=0 && Pire::is_cntrl(unicode[(unsigned char)ch]);}
is_graphCodePage43 	bool is_graph(int ch) const {return ch>=0 && Pire::is_graph(unicode[(unsigned char)ch]);}
is_printCodePage44 	bool is_print(int ch) const {return ch>=0 && Pire::is_print(unicode[(unsigned char)ch]);}
45 	// non-standard
is_composedCodePage46 	bool is_composed(int ch) const {return ch>=0 && Pire::is_composed(unicode[(unsigned char)ch]);}
47 
48 	char *strlwr(char *in_out, size_t len = (unsigned)(-1)) const;
49 	char *strupr(char *in_out, size_t len = (unsigned)(-1)) const;
50 	char *strlwr(const char *in, char *out, size_t len = (unsigned)(-1)) const;
51 	char *strupr(const char *in, char *out, size_t len = (unsigned)(-1)) const;
52 	int   stricmp(const char* s1, const char *s2) const;
53 	int   strnicmp(const char* s1, const char *s2, size_t len) const;
54 
55 	unsigned char to_upper(unsigned char ch) const;
56 	unsigned char to_lower(unsigned char ch) const;
57 	unsigned char to_title(unsigned char ch) const;
58 	int           to_digit(unsigned char ch) const;
59 
60 	static void Initialize();
61 };
62 
63 const CodePage *CodePageByName(const char *name);
64 
65 namespace NCodepagePrivate {
66 	class TCodepagesMap {
67 	private:
68 		const CodePage* Data[CODES_MAX];
69 
70 	public:
71 		TCodepagesMap();
72 
Get(docCodes e)73 		const CodePage* Get(docCodes e) {
74 			Y_ASSERT(CODES_UNKNOWN < e && e < CODES_MAX);
75 			return Data[e];
76 		}
77 	};
78 }
79 
CodePageByDocCode(docCodes e)80 const CodePage *CodePageByDocCode(docCodes e)
81 {
82 	return Singleton<NCodepagePrivate::TCodepagesMap>()->Get(e);
83 }
84 
DocCodeByName(const char * name)85 docCodes DocCodeByName(const char *name)
86 {
87 	const CodePage *CP = CodePageByName(name);
88 	if (CP == 0)
89 		return CODES_UNKNOWN;
90 	return CP->CPEnum;
91 }
DocCodeByCodePage(const CodePage * CP)92 docCodes DocCodeByCodePage(const CodePage *CP)
93 {
94 	return CP->CPEnum;
95 }
NameByDocCode(docCodes e)96 const char *NameByDocCode(docCodes e)
97 {
98 	return CodePageByDocCode(e)->Names[0];
99 }
NameByCodePage(const CodePage * CP)100 const char *NameByCodePage(const CodePage *CP)
101 {
102 	return CP->Names[0];
103 }
104 
105 docCodes EncodingHintByName(const char* name);
106 
107 /*****************************************************************\
108  *                    struct Encoder                               *
109 \*****************************************************************/
/// Status codes returned by the recoding helpers in this header.
/// The numeric values are spelled out; they match the implicit numbering.
enum RECODE_RESULT {
	RECODE_OK            = 0,
	RECODE_EOINPUT       = 1,
	RECODE_EOOUTPUT      = 2,
	RECODE_BROKENSYMBOL  = 3,
	RECODE_ERROR         = 4,
	RECODE_DEFAULTSYMBOL = 5
};
118 
119 struct Encoder {
120 	char *Table[256];
121 	const  char *DefaultChar;
122 
CodeEncoder123 	char Code(wchar32 ch) const
124 	{
125 		if (ch > 0xFFFF)
126 			return 0;
127 		return (unsigned char)Table[(ch>>8)&255][ch&255];
128 	}
TrEncoder129 	char Tr(wchar32 ch) const
130 	{
131 		char code = Code(ch);
132 		if (code == 0 && ch != 0)
133 			code =  DefaultChar[wc_type(ch)];
134 		Y_ASSERT(code != 0 || ch == 0);
135 		return code;
136 	}
137 
138 	unsigned char operator [](wchar32 ch) const
139 	{
140 		return Tr(ch);
141 	}
142 	void Tr(const wchar32 *in, char *out, size_t len) const;
143 	void Tr(const wchar32 *in, char *out) const;
144 	char * DefaultPlane;
145 };
146 
147 struct CustomEncoder : public Encoder {
148 	void Create (const CodePage *target, int mode=1);
149 	void Free ();
150 private:
151 	void addToTable(wchar32 ucode, unsigned char code, const CodePage* target);
152 };
153 
154 struct MultipleEncMapping {
155 	typedef ui32 maptype;
156 	static maptype DefaultPlane[256];
157 	maptype *Table[256];
158 	MultipleEncMapping();
GetEncodingsMultipleEncMapping159 	maptype GetEncodings(wchar32 ch) const {
160 		return Table[(ch>>8)&255][ch&255];
161 	}
162 	void ImportEncoder(const Encoder &E, int enc);
163 	~MultipleEncMapping();
164 
165 	DECLARE_NOCOPY(MultipleEncMapping);
166 };
167 
168 /*****************************************************************\
169  *                    struct Recoder                               *
170 \*****************************************************************/
171 struct Recoder {
172 	unsigned char Table[257];
173 
174 	void Create(const CodePage &source, const CodePage &target);
175 	void Create(const CodePage &source, const Encoder* wideTarget);
176 
177 	void Create(const CodePage &page, wchar32 (*mapper)(wchar32));
178 	void Create(const CodePage &page, const Encoder* widePage, wchar32 (*mapper)(wchar32));
179 
TrRecoder180 	unsigned char Tr(unsigned char c) const
181 	{
182 		return Table[c];
183 	}
184 	unsigned char operator [](unsigned char c) const
185 	{
186 		return Table[c];
187 	}
188 	void  Tr(const char *in, char *out, size_t len) const;
189 	void  Tr(const char *in, char *out) const;
190 	void  Tr(char *in_out, size_t len) const;
191 	void  Tr(char *in_out) const;
192 };
193 
194 extern const Encoder *EncodeTo[CODES_MAX];
195 
196 extern const struct Encoder &WideCharToYandex;
197 
198 extern const Recoder rcdr_to_yandex[];
199 extern const Recoder rcdr_from_yandex[];
200 
201 extern const Recoder rcdr_to_lower[];
202 extern const Recoder rcdr_to_upper[];
203 extern const Recoder rcdr_to_title[];
204 
to_upper(unsigned char ch)205 unsigned char CodePage::to_upper(unsigned char ch) const
206 {
207 	return rcdr_to_upper[CPEnum].Table[ch];
208 }
to_lower(unsigned char ch)209 unsigned char CodePage::to_lower(unsigned char ch) const
210 {
211 	return rcdr_to_lower[CPEnum].Table[ch];
212 }
to_title(unsigned char ch)213 unsigned char CodePage::to_title(unsigned char ch) const
214 {
215 	return rcdr_to_title[CPEnum].Table[ch];
216 }
to_digit(unsigned char ch)217 int CodePage::to_digit(unsigned char ch) const
218 {
219 	return Pire::to_digit(unicode[ch]);
220 }
221 
222 extern const struct CodePage csYandex;
223 
// Named byte values; presumably positions of special characters in the Yandex
// code page (csYandex) -- verify against the code page table.
// `static` only spells out the internal linkage these constants already have.
static const unsigned char yaUNK_Up = 0xA6;
static const unsigned char yaUNK_Lo = 0xB6;
static const unsigned char yaIDEOGR = 0x9F;
static const unsigned char yaSHY    = 0x8F;
static const unsigned char yaACUTE  = 0x80;
static const unsigned char yaGradus = 0xB0;
230 
/// Returns the mask that keeps the payload bits of a UTF-8 lead byte for a
/// sequence of @p len bytes: 0x1F for 2, 0x0F for 3, 0x07 for 4.
///
/// Lengths 0 and 1 yield 0, as before. Any length above 4 now also yields 0
/// instead of indexing past the end of the lookup table.
///
/// Declared inline because the definition lives in a header (avoids duplicate
/// external definitions across translation units).
inline unsigned char utf8_leadbyte_mask(size_t len) {
	static const unsigned char masks[] = { 0x00, 0x00, 0x1F, 0x0F, 0x07 };
	if (len >= sizeof(masks) / sizeof(masks[0]))
		return 0;
	return masks[len];
}
235 
/// Returns the length in bytes of the UTF-8 sequence introduced by lead byte
/// @p p, or 0 when @p p cannot start a sequence.
///
/// The table is indexed by the top five bits of the byte:
///   0x00-0x7F -> 1, 0x80-0xBF -> 0 (continuation byte), 0xC0-0xDF -> 2,
///   0xE0-0xEF -> 3, 0xF0-0xF7 -> 4, 0xF8-0xFF -> 0.
///
/// Declared inline because the definition lives in a header (avoids duplicate
/// external definitions across translation units).
inline size_t utf8_rune_len(const unsigned char p)
{
	return "\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\0\0\0\0\0\0\0\2\2\2\2\3\3\4\0"[p >> 3];
}
240 
utf8_rune_len_by_ucs(wchar32 rune)241 size_t utf8_rune_len_by_ucs(wchar32 rune)
242 {
243 	if (rune < 0x80)
244 		return 1U;
245 	else if (rune < 0x800)
246 		return 2U;
247 	else if (rune < 0x10000)
248 		return 3U;
249 	else if (rune < 0x200000)
250 		return 4U;
251 	else if (rune < 0x4000000)
252 		return 5U;
253 	else
254 		return 6U;
255 }
256 
257 extern const wchar32 BROKEN_RUNE;
258 
utf8_read_rune(wchar32 & rune,size_t & rune_len,const unsigned char * s,const unsigned char * end)259 RECODE_RESULT utf8_read_rune(wchar32 &rune, size_t &rune_len, const unsigned char *s, const unsigned char *end)
260 {
261 	rune = BROKEN_RUNE;
262 	rune_len = 0;
263 	wchar32 _rune;
264 
265 	size_t _len = utf8_rune_len(*s);
266 	if (s + _len > end) return RECODE_EOINPUT;  //[EOINPUT]
267 	if (_len==0) return RECODE_BROKENSYMBOL;    //[BROKENSYMBOL] in first byte
268 	_rune = *s++;                               //[00000000 0XXXXXXX]
269 
270 	if (_len > 1) {
271 		_rune &= utf8_leadbyte_mask(_len);
272 		unsigned char ch = *s++;
273 		if ((ch & 0xC0) != 0x80)
274 			return RECODE_BROKENSYMBOL;         //[BROKENSYMBOL] in second byte
275 		_rune <<= 6;
276 		_rune |= ch & 0x3F;                     //[00000XXX XXYYYYYY]
277 		if (_len > 2) {
278 			ch = *s++;
279 			if ((ch & 0xC0) != 0x80)
280 				return RECODE_BROKENSYMBOL;     //[BROKENSYMBOL] in third byte
281 			_rune <<= 6;
282 			_rune |= ch & 0x3F;                 //[XXXXYYYY YYZZZZZZ]
283 			if (_len > 3) {
284 				ch = *s;
285 				if ((ch & 0xC0) != 0x80)
286 					return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in fourth byte
287 				_rune <<= 6;
288 				_rune |= ch & 0x3F;             //[XXXYY YYYYZZZZ ZZQQQQQQ]
289 			}
290 		}
291 	}
292 	rune_len = _len;
293 	rune = _rune;
294 	return RECODE_OK;
295 }
296 
utf8_put_rune(wchar32 rune,size_t & rune_len,unsigned char * s,const unsigned char * end)297 RECODE_RESULT utf8_put_rune(wchar32 rune, size_t &rune_len, unsigned char *s, const unsigned char *end){
298 	rune_len = 0;
299 	size_t tail = end - s;
300 	if (rune < 0x80){
301 		if (tail <= 0) return RECODE_EOOUTPUT;
302 		*s = (unsigned char)rune;
303 		rune_len = 1;
304 		return RECODE_OK;
305 	}
306 	if (rune < 0x800){
307 		if (tail <= 1) return RECODE_EOOUTPUT;
308 		*s++ = (unsigned char)(0xC0 | (rune >> 6));
309 		*s   = (unsigned char)(0x80 | (rune & 0x3F));
310 		rune_len = 2;
311 		return RECODE_OK;
312 	}
313 	if (rune < 0x10000) {
314 		if (tail <= 2) return RECODE_EOOUTPUT;
315 		*s++ = (unsigned char)(0xE0 | (rune >> 12));
316 		*s++ = (unsigned char)(0x80 | ((rune >> 6) & 0x3F));
317 		*s   = (unsigned char)(0x80 | (rune & 0x3F));
318 		rune_len = 3;
319 		return RECODE_OK;
320 	}
321 	/*if (rune < 0x200000)*/ {
322 		if (tail <= 3) return RECODE_EOOUTPUT;
323 		*s++ = (unsigned char)(0xF0 | ((rune >> 18) & 0x07));
324 		*s++ = (unsigned char)(0x80 | ((rune >> 12) & 0x3F));
325 		*s++ = (unsigned char)(0x80 | ((rune >> 6) & 0x3F));
326 		*s   = (unsigned char)(0x80 | (rune & 0x3F));
327 		rune_len = 4;
328 		return RECODE_OK;
329 	}
330 };
331 
utf8_read_rune_from_unknown_plane(wchar32 & rune,size_t & rune_len,const wchar32 * s,const wchar32 * end)332 RECODE_RESULT utf8_read_rune_from_unknown_plane(wchar32 &rune, size_t &rune_len, const wchar32 *s, const wchar32 *end) {
333 	if ((*s & 0xFF00) != 0xF000) {
334 		rune_len = 1;
335 		rune = *s;
336 		return RECODE_OK;
337 	}
338 
339 	rune_len = 0;
340 
341 	size_t _len = utf8_rune_len((unsigned char)(*s));
342 	if (s + _len > end) return RECODE_EOINPUT;  //[EOINPUT]
343 	if (_len == 0) return RECODE_BROKENSYMBOL;  //[BROKENSYMBOL] in first byte
344 
345 	wchar32 _rune = (ui8)(*s++);                //[00000000 0XXXXXXX]
346 	if (_len > 1) {
347 		_rune &= utf8_leadbyte_mask(_len);
348 		wchar32 ch = *s++;
349 		if ((ch & 0xFFC0) != 0xF080)
350 			return RECODE_BROKENSYMBOL;         //[BROKENSYMBOL] in second byte
351 		_rune <<= 6;
352 		_rune |= ch & 0x3F;                     //[00000XXX XXYYYYYY]
353 		if (_len > 2) {
354 			ch = *s++;
355 			if ((ch & 0xFFC0) != 0xF080)
356 				return RECODE_BROKENSYMBOL;     //[BROKENSYMBOL] in third byte
357 			_rune <<= 6;
358 			_rune |= ch & 0x3F;                 //[XXXXYYYY YYZZZZZZ]
359 			if (_len > 3) {
360 				ch = *s;
361 				if ((ch & 0xFFC0) != 0xF080)
362 					return RECODE_BROKENSYMBOL; //[BROKENSYMBOL] in fourth byte
363 				_rune <<= 6;
364 				_rune |= ch & 0x3F;             //[XXXYY YYYYZZZZ ZZQQQQQQ]
365 			}
366 		}
367 	}
368 	rune_len = _len;
369 	rune = _rune;
370 	return RECODE_OK;
371 }
372 
373 /// this function changes (lowers) [end] position in case of utf-8
374 /// null character is NOT assumed or written at [*end]
375 void DecodeUnknownPlane(wchar32 *start, wchar32 *&end, const docCodes enc4unk);
376 
377 /// this function may return less than [len] bytes in case of utf-8
378 /// [dst] buffer must have at least [len] bytes
379 /// [dst] is NOT terminated with null character
380 size_t DecodeUnknownAndRecodeToYandex(const wchar32 *src, char *dst, size_t len, const docCodes enc4unk);
381 
382 void ToLower(char* s, size_t n, const CodePage& cp = csYandex) {
383 	char* const e = s + n;
384 	for (; s != e; ++s)
385 		*s = cp.to_lower(*s);
386 }
387 
388 void ToUpper(char* s, size_t n, const CodePage& cp = csYandex) {
389 	char* const e = s + n;
390 	for (; s != e; ++s)
391 		*s = cp.to_upper(*s);
392 }
393 
394 #endif
395