1 /*
2  * Copyright (C) 1999-2001, 2005 Free Software Foundation, Inc.
3  * This file is part of the GNU LIBICONV Library.
4  *
5  * The GNU LIBICONV Library is free software; you can redistribute it
6  * and/or modify it under the terms of the GNU Library General Public
7  * License as published by the Free Software Foundation; either version 2
8  * of the License, or (at your option) any later version.
9  *
10  * The GNU LIBICONV Library is distributed in the hope that it will be
11  * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * Library General Public License for more details.
14  *
15  * You should have received a copy of the GNU Library General Public
16  * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
17  * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,
18  * Fifth Floor, Boston, MA 02110-1301, USA.
19  */
20 
21 /*
22  * GBK
23  */
24 
25 /*
26  * GBK, as described in Ken Lunde's book, is an extension of GB 2312-1980
27  * (shifted by adding 0x8080 to the range 0xA1A1..0xFEFE, as used in EUC-CN).
28  * It adds the following ranges:
29  *
30  * (part of GBK/1)  0xA2A1-0xA2AA  Small Roman numerals
31  * GBK/3   0x{81-A0}{40-7E,80-FE}  6080 new characters, all in Unicode
32  * GBK/4   0x{AA-FE}{40-7E,80-A0}  8160 new characters, 8080 in Unicode
33  * GBK/5   0x{A8-A9}{40-7E,80-A0}  166 new characters, 153 in Unicode
34  *
35  * Furthermore, all four tables I have looked at
36  *   - the CP936 table by Microsoft, found on ftp.unicode.org in 1999,
37  *   - the GBK table by Sun, investigated on a Solaris 2.7 machine,
38  *   - the GBK tables by CWEX, found in the Big5+ package,
39  *   - the GB18030 standard (second printing),
40  * agree in the following extensions. (Ken Lunde must have overlooked these
41  * differences between GB2312 and GBK. Also, the CWEX tables have additional
42  * differences.)
43  *
44  * 1. Some characters in the GB2312 range are defined differently:
45  *
46  *     code    GB2312                         GBK
47  *    0xA1A4   0x30FB # KATAKANA MIDDLE DOT   0x00B7 # MIDDLE DOT
48  *    0xA1AA   0x2015 # HORIZONTAL BAR        0x2014 # EM DASH
49  *
50  * 2. 19 characters added in the range 0xA6E0-0xA6F5.
51  *
52  * 3. 4 characters added in the range 0xA8BB-0xA8C0.
53  *
54  * CP936 as of 1999 was identical to GBK. However, since 1999, Microsoft has
55  * added new mappings to CP936...
56  */
57 
58 #include "gbkext1.h"
59 #include "gbkext2.h"
60 #include "gbkext_inv.h"
61 #include "cp936ext.h"
62 
63 static int
gbk_mbtowc(conv_t conv,ucs4_t * pwc,const unsigned char * s,int n)64 gbk_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)
65 {
66   unsigned char c = *s;
67 
68   if (c >= 0x81 && c < 0xff) {
69     if (n < 2)
70       return RET_TOOFEW(0);
71     if (c >= 0xa1 && c <= 0xf7) {
72       unsigned char c2 = s[1];
73       if (c == 0xa1) {
74         if (c2 == 0xa4) {
75           *pwc = 0x00b7;
76           return 2;
77         }
78         if (c2 == 0xaa) {
79           *pwc = 0x2014;
80           return 2;
81         }
82       }
83       if (c2 >= 0xa1 && c2 < 0xff) {
84         unsigned char buf[2];
85         int ret;
86         buf[0] = c-0x80; buf[1] = c2-0x80;
87         ret = gb2312_mbtowc(conv,pwc,buf,2);
88         if (ret != RET_ILSEQ)
89           return ret;
90         buf[0] = c; buf[1] = c2;
91         ret = cp936ext_mbtowc(conv,pwc,buf,2);
92         if (ret != RET_ILSEQ)
93           return ret;
94       }
95     }
96     if (c >= 0x81 && c <= 0xa0)
97       return gbkext1_mbtowc(conv,pwc,s,2);
98     if (c >= 0xa8 && c <= 0xfe)
99       return gbkext2_mbtowc(conv,pwc,s,2);
100     if (c == 0xa2) {
101       unsigned char c2 = s[1];
102       if (c2 >= 0xa1 && c2 <= 0xaa) {
103         *pwc = 0x2170+(c2-0xa1);
104         return 2;
105       }
106     }
107   }
108   return RET_ILSEQ;
109 }
110 
111 static int
gbk_wctomb(conv_t conv,unsigned char * r,ucs4_t wc,int n)112 gbk_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)
113 {
114   unsigned char buf[2];
115   int ret;
116 
117   if (wc != 0x30fb && wc != 0x2015) {
118     ret = gb2312_wctomb(conv,buf,wc,2);
119     if (ret != RET_ILUNI) {
120       if (ret != 2) abort();
121       if (n < 2)
122         return RET_TOOSMALL;
123       r[0] = buf[0]+0x80;
124       r[1] = buf[1]+0x80;
125       return 2;
126     }
127   }
128   ret = gbkext_inv_wctomb(conv,buf,wc,2);
129   if (ret != RET_ILUNI) {
130     if (ret != 2) abort();
131     if (n < 2)
132       return RET_TOOSMALL;
133     r[0] = buf[0];
134     r[1] = buf[1];
135     return 2;
136   }
137   if (wc >= 0x2170 && wc <= 0x2179) {
138     if (n < 2)
139       return RET_TOOSMALL;
140     r[0] = 0xa2;
141     r[1] = 0xa1 + (wc-0x2170);
142     return 2;
143   }
144   ret = cp936ext_wctomb(conv,buf,wc,2);
145   if (ret != RET_ILUNI) {
146     if (ret != 2) abort();
147     if (n < 2)
148       return RET_TOOSMALL;
149     r[0] = buf[0];
150     r[1] = buf[1];
151     return 2;
152   }
153   if (wc == 0x00b7) {
154     if (n < 2)
155       return RET_TOOSMALL;
156     r[0] = 0xa1;
157     r[1] = 0xa4;
158     return 2;
159   }
160   if (wc == 0x2014) {
161     if (n < 2)
162       return RET_TOOSMALL;
163     r[0] = 0xa1;
164     r[1] = 0xaa;
165     return 2;
166   }
167 
168   return RET_ILUNI;
169 }
170