1 //
2 // GB18030Encoding.cs
3 //
4 // Author:
5 //	Atsushi Enomoto  <atsushi@ximian.com>
6 //
7 using System;
8 using System.Reflection;
9 using System.Text;
10 using I18N.Common;
11 
12 #if DISABLE_UNSAFE
13 using MonoEncoder = I18N.Common.MonoSafeEncoder;
14 using MonoEncoding = I18N.Common.MonoSafeEncoding;
15 #endif
16 
17 namespace I18N.CJK
18 {
19 	[Serializable]
20 	internal class ENCgb18030 : GB18030Encoding
21 	{
ENCgb18030()22 		public ENCgb18030 (): base () {}
23 	}
24 
25 	[Serializable]
26 	public class CP54936 : GB18030Encoding { }
27 
28 	[Serializable]
29 	public class GB18030Encoding : MonoEncoding
30 	{
31 		// Constructor.
GB18030Encoding()32 		public GB18030Encoding ()
33 			: base (54936, 936)
34 		{
35 		}
36 
37 		public override string EncodingName {
38 			get { return "Chinese Simplified (GB18030)"; }
39 		}
40 
41 		public override string HeaderName {
42 			get { return "GB18030"; }
43 		}
44 
45 		public override string BodyName {
46 			get { return "GB18030"; }
47 		}
48 
49 		public override string WebName {
50 			get { return "GB18030"; }
51 		}
52 
53 		public override bool IsMailNewsDisplay {
54 			get { return true; }
55 		}
56 
57 		public override bool IsMailNewsSave {
58 			get { return true; }
59 		}
60 
61 		public override bool IsBrowserDisplay {
62 			get { return true; }
63 		}
64 
65 		public override bool IsBrowserSave {
66 			get { return true; }
67 		}
68 
GetMaxByteCount(int len)69 		public override int GetMaxByteCount (int len)
70 		{
71 			// non-GB2312 characters in \u0080 - \uFFFF
72 			return len * 4;
73 		}
74 
GetMaxCharCount(int len)75 		public override int GetMaxCharCount (int len)
76 		{
77 			return len;
78 		}
79 
80 #if !DISABLE_UNSAFE
GetByteCountImpl(char* chars, int count)81 		public unsafe override int GetByteCountImpl (char* chars, int count)
82 		{
83 			return new GB18030Encoder (this).GetByteCountImpl (chars, count, true);
84 		}
85 
GetBytesImpl(char* chars, int charCount, byte* bytes, int byteCount)86 		public unsafe override int GetBytesImpl (char* chars, int charCount, byte* bytes, int byteCount)
87 		{
88 			return new GB18030Encoder (this).GetBytesImpl (chars, charCount, bytes, byteCount, true);
89 		}
90 #else
GetByteCount(char [] chars, int index, int length)91 		public override int GetByteCount (char [] chars, int index, int length)
92 		{
93 			return new GB18030Encoder (this).GetByteCount (chars, index, length, true);
94 		}
95 
GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex)96 		public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex)
97 		{
98 			return new GB18030Encoder (this).GetBytes (chars, charIndex, charCount, bytes, byteIndex, true);
99 		}
100 #endif
101 
GetCharCount(byte [] bytes, int start, int len)102 		public override int GetCharCount (byte [] bytes, int start, int len)
103 		{
104 			return new GB18030Decoder ().GetCharCount (bytes, start, len);
105 		}
106 
GetChars(byte [] bytes, int byteIdx, int srclen, char [] chars, int charIdx)107 		public override int GetChars (byte [] bytes, int byteIdx, int srclen, char [] chars, int charIdx)
108 		{
109 			return new GB18030Decoder ().GetChars (bytes, byteIdx, srclen, chars, charIdx);
110 		}
111 
GetEncoder()112 		public override Encoder GetEncoder ()
113 		{
114 			return new GB18030Encoder (this);
115 		}
116 
GetDecoder()117 		public override Decoder GetDecoder ()
118 		{
119 			return new GB18030Decoder ();
120 		}
121 	}
122 
123 	class GB18030Decoder : DbcsEncoding.DbcsDecoder
124 	{
125 		static DbcsConvert gb2312 = DbcsConvert.Gb2312;
126 		// for now incomplete block is not supported - should we?
127 		// int incomplete1 = -1, incomplete2 = -1, incomplete3 = -1;
128 
GB18030Decoder()129 		public GB18030Decoder ()
130 			: base (null)
131 		{
132 		}
133 
GetCharCount(byte [] bytes, int start, int len)134 		public override int GetCharCount (byte [] bytes, int start, int len)
135 		{
136 			CheckRange (bytes, start, len);
137 
138 			int end = start + len;
139 			int ret = 0;
140 			while (start < end) {
141 				if (bytes [start] < 0x80) {
142 					ret++;
143 					start++;
144 					continue;
145 				}
146 				else if (bytes [start] == 0x80) {
147 					// Euro sign - actually it is obsolete,
148 					// now it's just reserved but not used
149 					ret++;
150 					start++;
151 					continue;
152 				}
153 				else if (bytes [start] == 0xFF) {
154 					// invalid data - fill '?'
155 					ret++;
156 					start++;
157 					continue;
158 				}
159 				else if (start + 1 >= end) {
160 //					incomplete1 = bytes [start];
161 //					incomplete2 = -1;
162 //					incomplete3 = -1;
163 					ret++;
164 					break; // incomplete tail.
165 				}
166 
167 				byte second = bytes [start + 1];
168 				if (second == 0x7F || second == 0xFF) {
169 					// invalid data
170 					ret++;
171 					start += 2;
172 					continue;
173 				}
174 				else if (0x30 <= second && second <= 0x39) {
175 					// UCS mapping
176 					if (start + 3 >= end) {
177 						// incomplete tail.
178 //						incomplete1 = bytes [start];
179 //						incomplete2 = bytes [start + 1];
180 //						if (start + 3 == end)
181 //							incomplete3 = bytes [start + 2];
182 						ret += start + 3 == end ? 3 : 2;
183 						break;
184 					}
185 					long value = GB18030Source.FromGBX (bytes, start);
186 					if (value < 0) {
187 						// invalid data.
188 						ret++;
189 						start -= (int) value;
190 					} else if (value >= 0x10000) {
191 						// UTF16 surrogate
192 						ret += 2;
193 						start += 4;
194 					} else {
195 						// UTF16 BMP
196 						ret++;
197 						start+= 4;
198 					}
199 				} else {
200 					// GB2312 mapping
201 					start += 2;
202 					ret++;
203 				}
204 			}
205 			return ret;
206 		}
207 
GetChars(byte [] bytes, int byteIndex, int byteCount, char [] chars, int charIndex)208 		public override int GetChars (byte [] bytes, int byteIndex, int byteCount, char [] chars, int charIndex)
209 		{
210 			CheckRange (bytes, byteIndex, byteCount, chars, charIndex);
211 
212 			int byteEnd = byteIndex + byteCount;
213 			int charStart = charIndex;
214 
215 			while (byteIndex < byteEnd) {
216 				if (bytes [byteIndex] < 0x80) {
217 					chars [charIndex++] = (char) bytes [byteIndex++];
218 					continue;
219 				}
220 				else if (bytes [byteIndex] == 0x80) {
221 					// Euro sign - actually it is obsolete,
222 					// now it's just reserved but not used
223 					chars [charIndex++] = '\u20AC';
224 					byteIndex++;
225 					continue;
226 				}
227 				else if (bytes [byteIndex] == 0xFF) {
228 					// invalid data - fill '?'
229 					chars [charIndex++] = '?';
230 					byteIndex++;
231 					continue;
232 				}
233 				else if (byteIndex + 1 >= byteEnd) {
234 					//incomplete1 = bytes [byteIndex++];
235 					//incomplete2 = -1;
236 					//incomplete3 = -1;
237 					break; // incomplete tail.
238 				}
239 
240 				byte second = bytes [byteIndex + 1];
241 				if (second == 0x7F || second == 0xFF) {
242 					// invalid data
243 					chars [charIndex++] = '?';
244 					byteIndex += 2;
245 				}
246 				else if (0x30 <= second && second <= 0x39) {
247 					// UCS mapping
248 					if (byteIndex + 3 >= byteEnd) {
249 						// incomplete tail.
250 						//incomplete1 = bytes [byteIndex];
251 						//incomplete2 = bytes [byteIndex + 1];
252 						//if (byteIndex + 3 == byteEnd)
253 						//	incomplete3 = bytes [byteIndex + 2];
254 						break;
255 					}
256 					long value = GB18030Source.FromGBX (bytes, byteIndex);
257 					if (value < 0) {
258 						// invalid data.
259 						chars [charIndex++] = '?';
260 						byteIndex -= (int) value;
261 					} else if (value >= 0x10000) {
262 						// UTF16 surrogate
263 						value -= 0x10000;
264 						chars [charIndex++] = (char) (value / 0x400 + 0xD800);
265 						chars [charIndex++] = (char) (value % 0x400 + 0xDC00);
266 						byteIndex += 4;
267 					} else {
268 						// UTF16 BMP
269 						chars [charIndex++] = (char) value;
270 						byteIndex += 4;
271 					}
272 				} else {
273 					byte first = bytes [byteIndex];
274 					int ord = ((first - 0x81) * 191 + second - 0x40) * 2;
275 					char c1 = ord < 0 || ord >= gb2312.n2u.Length ?
276 						'\0' : (char) (gb2312.n2u [ord] + gb2312.n2u [ord + 1] * 256);
277 					if (c1 == 0)
278 						chars [charIndex++] = '?';
279 					else
280 						chars [charIndex++] = c1;
281 					byteIndex += 2;
282 				}
283 			}
284 
285 			return charIndex - charStart;
286 		}
287 	}
288 
289 	class GB18030Encoder : MonoEncoder
290 	{
291 		static DbcsConvert gb2312 = DbcsConvert.Gb2312;
292 
GB18030Encoder(MonoEncoding owner)293 		public GB18030Encoder (MonoEncoding owner)
294 			: base (owner)
295 		{
296 		}
297 
298 		char incomplete_byte_count;
299 		char incomplete_bytes;
300 
301 #if !DISABLE_UNSAFE
GetByteCountImpl(char* chars, int count, bool refresh)302 		public unsafe override int GetByteCountImpl (char* chars, int count, bool refresh)
303 		{
304 			int start = 0;
305 			int end = count;
306 			int ret = 0;
307 			while (start < end) {
308 				char ch = chars [start];
309 				if (ch < 0x80) {
310 					// ASCII
311 					ret++;
312 					start++;
313 					continue;
314 				} else if (Char.IsSurrogate (ch)) {
315 					// Surrogate
316 					if (start + 1 == end) {
317 						incomplete_byte_count = ch;
318 						start++;
319 					} else {
320 						ret += 4;
321 						start += 2;
322 					}
323 					continue;
324 				}
325 
326 				if (ch < 0x80 || ch == 0xFF) {
327 					// ASCII
328 					ret++;
329 					start++;
330 					continue;
331 				}
332 
333 				byte b1 = gb2312.u2n [((int) ch) * 2 + 1];
334 				byte b2 = gb2312.u2n [((int) ch) * 2];
335 				if (b1 != 0 && b2 != 0) {
336 					// GB2312
337 					ret += 2;
338 					start++;
339 					continue;
340 				}
341 
342 				// non-GB2312
343 				long value = GB18030Source.FromUCS (ch);
344 				if (value < 0)
345 					ret++; // invalid(?)
346 				else
347 					ret += 4;
348 				start++;
349 			}
350 
351 			if (refresh) {
352 				if (incomplete_byte_count != char.MinValue)
353 					ret++;
354 				incomplete_byte_count = char.MinValue;
355 			}
356 			return ret;
357 		}
358 
GetBytesImpl(char* chars, int charCount, byte* bytes, int byteCount, bool refresh)359 		public unsafe override int GetBytesImpl (char* chars, int charCount, byte* bytes, int byteCount, bool refresh)
360 		{
361 			int charIndex = 0;
362 			int byteIndex = 0;
363 
364 			int charEnd = charIndex + charCount;
365 			int byteStart = byteIndex;
366 			char ch = incomplete_bytes;
367 
368 			while (charIndex < charEnd) {
369 				if (incomplete_bytes == char.MinValue)
370 					ch = chars [charIndex++];
371 				else
372 					incomplete_bytes = char.MinValue;
373 
374 				if (ch < 0x80) {
375 					// ASCII
376 					bytes [byteIndex++] = (byte) ch;
377 					continue;
378 				} else if (Char.IsSurrogate (ch)) {
379 					// Surrogate
380 					if (charIndex == charEnd) {
381 						incomplete_bytes = ch;
382 						break; // incomplete
383 					}
384 					char ch2 = chars [charIndex++];
385 					if (!Char.IsSurrogate (ch2)) {
386 						// invalid surrogate
387 						HandleFallback (
388 							chars, ref charIndex, ref charCount,
389 							bytes, ref byteIndex, ref byteCount, null);
390 						continue;
391 					}
392 					int cp = (ch - 0xD800) * 0x400 + ch2 - 0xDC00;
393 					GB18030Source.Unlinear (bytes + byteIndex, GB18030Source.FromUCSSurrogate (cp));
394 					byteIndex += 4;
395 					continue;
396 				}
397 
398 
399 				if (ch <= 0x80 || ch == 0xFF) {
400 					// Character maps to itself
401 					bytes [byteIndex++] = (byte) ch;
402 					continue;
403 				}
404 
405 				byte b1 = gb2312.u2n [((int) ch) * 2 + 1];
406 				byte b2 = gb2312.u2n [((int) ch) * 2];
407 				if (b1 != 0 && b2 != 0) {
408 					bytes [byteIndex++] = b1;
409 					bytes [byteIndex++] = b2;
410 					continue;
411 				}
412 
413 				long value = GB18030Source.FromUCS (ch);
414 				if (value < 0)
415 					bytes [byteIndex++] = 0x3F; // invalid(?)
416 				else {
417 					// non-GB2312
418 					GB18030Source.Unlinear (bytes + byteIndex, value);
419 					byteIndex += 4;
420 				}
421 			}
422 
423 			if (refresh) {
424 				if (incomplete_bytes != char.MinValue)
425 					bytes [byteIndex++] = 0x3F; // incomplete
426 				incomplete_bytes = char.MinValue;
427 			}
428 
429 			return byteIndex - byteStart;
430 		}
431 #else
432 
GetByteCount(char[] chars, int index, int count, bool refresh)433 		public override int GetByteCount(char[] chars, int index, int count, bool refresh)
434 		{
435 			int start = 0;
436 			int end = count;
437 			int ret = 0;
438 			while (start < end)
439 			{
440 				char ch = chars[start];
441 				if (ch < 0x80)
442 				{
443 					// ASCII
444 					ret++;
445 					start++;
446 					continue;
447 				}
448 				else if (Char.IsSurrogate(ch))
449 				{
450 					// Surrogate
451 					if (start + 1 == end)
452 					{
453 						incomplete_byte_count = ch;
454 						start++;
455 					}
456 					else
457 					{
458 						ret += 4;
459 						start += 2;
460 					}
461 					continue;
462 				}
463 
464 				if (ch < 0x80 || ch == 0xFF)
465 				{
466 					// ASCII
467 					ret++;
468 					start++;
469 					continue;
470 				}
471 
472 				byte b1 = gb2312.u2n[((int)ch) * 2 + 1];
473 				byte b2 = gb2312.u2n[((int)ch) * 2];
474 				if (b1 != 0 && b2 != 0)
475 				{
476 					// GB2312
477 					ret += 2;
478 					start++;
479 					continue;
480 				}
481 
482 				// non-GB2312
483 				long value = GB18030Source.FromUCS(ch);
484 				if (value < 0)
485 					ret++; // invalid(?)
486 				else
487 					ret += 4;
488 				start++;
489 			}
490 
491 			if (refresh)
492 			{
493 				if (incomplete_byte_count != char.MinValue)
494 					ret++;
495 				incomplete_byte_count = char.MinValue;
496 			}
497 			return ret;
498 		}
499 
GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex, bool refresh)500 		public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex, bool refresh)
501 		{
502 			int byteCount = bytes.Length;
503 			int charEnd = charIndex + charCount;
504 			int byteStart = byteIndex;
505 			char ch = incomplete_bytes;
506 
507 			while (charIndex < charEnd)
508 			{
509 				if (incomplete_bytes == char.MinValue)
510 					ch = chars[charIndex++];
511 				else
512 					incomplete_bytes = char.MinValue;
513 
514 				if (ch < 0x80)
515 				{
516 					// ASCII
517 					bytes[byteIndex++] = (byte)ch;
518 					continue;
519 				}
520 				else if (Char.IsSurrogate(ch))
521 				{
522 					// Surrogate
523 					if (charIndex == charEnd)
524 					{
525 						incomplete_bytes = ch;
526 						break; // incomplete
527 					}
528 					char ch2 = chars[charIndex++];
529 					if (!Char.IsSurrogate(ch2))
530 					{
531 						// invalid surrogate
532 						HandleFallback (chars, ref charIndex, ref charCount,
533 							bytes, ref byteIndex, ref byteCount, null);
534 						continue;
535 					}
536 					int cp = (ch - 0xD800) * 0x400 + ch2 - 0xDC00;
537 					GB18030Source.Unlinear(bytes,  byteIndex, GB18030Source.FromUCSSurrogate(cp));
538 					byteIndex += 4;
539 					continue;
540 				}
541 
542 
543 				if (ch <= 0x80 || ch == 0xFF)
544 				{
545 					// Character maps to itself
546 					bytes[byteIndex++] = (byte)ch;
547 					continue;
548 				}
549 
550 				byte b1 = gb2312.u2n[((int)ch) * 2 + 1];
551 				byte b2 = gb2312.u2n[((int)ch) * 2];
552 				if (b1 != 0 && b2 != 0)
553 				{
554 					bytes[byteIndex++] = b1;
555 					bytes[byteIndex++] = b2;
556 					continue;
557 				}
558 
559 				long value = GB18030Source.FromUCS(ch);
560 				if (value < 0)
561 					bytes[byteIndex++] = 0x3F; // invalid(?)
562 				else
563 				{
564 					// non-GB2312
565 					GB18030Source.Unlinear(bytes, byteIndex, value);
566 					byteIndex += 4;
567 				}
568 			}
569 
570 			if (refresh)
571 			{
572 				if (incomplete_bytes != char.MinValue)
573 					bytes[byteIndex++] = 0x3F; // incomplete
574 				incomplete_bytes = char.MinValue;
575 			}
576 
577 			return byteIndex - byteStart;
578 		}
579 #endif
580 	}
581 }
582