1
2 #include "wc.h"
3 #include "gbk.h"
4 #include "search.h"
5 #include "wtf.h"
6 #ifdef USE_UNICODE
7 #include "ucs.h"
8 #endif
9
10 #include "map/gb2312_gbk.map"
11
/*
 * Short local aliases for the byte-class values stored in WC_GBK_MAP.
 * The WC_GBK_MAP_* constants are not defined in this file (presumably
 * they come from gbk.h -- confirm).  The byte ranges noted below are the
 * ones the WC_GBK_MAP table in this file assigns to each class.
 */
#define C0 WC_GBK_MAP_C0	/* bytes 0x00-0x1F and 0x7F */
#define GL WC_GBK_MAP_GL	/* bytes 0x20-0x3F */
#define C1 WC_GBK_MAP_C1	/* byte 0xFF */
#define LB WC_GBK_MAP_LB	/* bytes 0x40-0x7E */
#define UB WC_GBK_MAP_UB	/* bytes 0x81-0xFE */
#define C80 WC_GBK_MAP_80	/* byte 0x80 */
18
19 wc_uint8 WC_GBK_MAP[ 0x100 ] = {
20 C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0,
21 C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0,
22 GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL,
23 GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL,
24 LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB,
25 LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB,
26 LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB,
27 LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, C0,
28
29 C80,UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB,
30 UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB,
31 UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB,
32 UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB,
33 UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB,
34 UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB,
35 UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB,
36 UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, C1,
37 };
38
39 wc_ccs
wc_gb2312_or_gbk(wc_uint16 code)40 wc_gb2312_or_gbk(wc_uint16 code) {
41 return wc_map_range_search(code,
42 gb2312_gbk_map, N_gb2312_gbk_map)
43 ? WC_CCS_GBK : WC_CCS_GB_2312;
44 }
45
46 wc_wchar_t
wc_gbk_to_cs128w(wc_wchar_t cc)47 wc_gbk_to_cs128w(wc_wchar_t cc)
48 {
49 cc.code = WC_GBK_N(cc.code);
50 if (cc.code < 0x4000)
51 cc.ccs = WC_CCS_GBK_1;
52 else {
53 cc.ccs = WC_CCS_GBK_2;
54 cc.code -= 0x4000;
55 }
56 cc.code = WC_N_CS128W(cc.code);
57 return cc;
58 }
59
60 wc_wchar_t
wc_cs128w_to_gbk(wc_wchar_t cc)61 wc_cs128w_to_gbk(wc_wchar_t cc)
62 {
63 cc.code = WC_CS128W_N(cc.code);
64 if (cc.ccs == WC_CCS_GBK_2)
65 cc.code += 0x4000;
66 cc.ccs = WC_CCS_GBK;
67 cc.code = WC_N_GBK(cc.code);
68 return cc;
69 }
70
71 wc_uint32
wc_gbk_to_N(wc_uint32 c)72 wc_gbk_to_N(wc_uint32 c)
73 {
74 if (c <= 0xA1A0) /* 0x8140 - 0xA1A0 */
75 return WC_GBK_N(c);
76 if (c <= 0xA2AA) /* 0xA240 - 0xA2A0, 0xA2A1 - 0xA2AA */
77 return WC_GBK_N(c) - ((c >> 8) - 0xA1) * 0x5E;
78 if (c <= 0xA6A0) /* 0xA240 - 0xA6A0 */
79 return WC_GBK_N(c) - ((c >> 8) - 0xA1) * 0x5E + 0x0A;
80 if (c <= 0xA6F5) /* 0xA6E0 - 0xA6F5 */
81 return WC_GBK_N(c) - ((c >> 8) - 0xA1) * 0x5E + 0x0A - 0x3F;
82 if (c <= 0xA8A0) /* 0xA7A0 - 0xA8A0 */
83 return WC_GBK_N(c) - ((c >> 8) - 0xA1) * 0x5E + 0x0A + 0x16;
84 if (c <= 0xA8C0) /* 0xA8BB - 0xA8C0 */
85 return WC_GBK_N(c) - ((c >> 8) - 0xA1) * 0x5E + 0x0A + 0x16 - 0x1A;
86 /* 0xA940 - 0xFEA0 */
87 return WC_GBK_N(c) - ((c >> 8) - 0xA1) * 0x5E + 0x0A + 0x16 + 0x06;
88 }
89
90 Str
wc_conv_from_gbk(Str is,wc_ces ces)91 wc_conv_from_gbk(Str is, wc_ces ces)
92 {
93 Str os;
94 wc_uchar *sp = (wc_uchar *)is->ptr;
95 wc_uchar *ep = sp + is->length;
96 wc_uchar *p;
97 int state = WC_GBK_NOSTATE;
98 wc_uint32 gbk;
99
100 for (p = sp; p < ep && *p < 0x80; p++)
101 ;
102 if (p == ep)
103 return is;
104 os = Strnew_size(is->length);
105 if (p > sp)
106 Strcat_charp_n(os, (char *)is->ptr, (int)(p - sp));
107
108 for (; p < ep; p++) {
109 switch (state) {
110 case WC_GBK_NOSTATE:
111 switch (WC_GBK_MAP[*p]) {
112 case UB:
113 state = WC_GBK_MBYTE1;
114 break;
115 case C80:
116 wtf_push(os, WC_CCS_GBK_80, *p);
117 break;
118 case C1:
119 wtf_push_unknown(os, p, 1);
120 break;
121 default:
122 Strcat_char(os, (char)*p);
123 break;
124 }
125 break;
126 case WC_GBK_MBYTE1:
127 if (WC_GBK_MAP[*p] & LB) {
128 gbk = ((wc_uint32)*(p-1) << 8) | *p;
129 if (*(p-1) >= 0xA1 && *p >= 0xA1)
130 wtf_push(os, wc_gb2312_or_gbk(gbk), gbk);
131 else
132 wtf_push(os, WC_CCS_GBK, gbk);
133 } else
134 wtf_push_unknown(os, p-1, 2);
135 state = WC_GBK_NOSTATE;
136 break;
137 }
138 }
139 switch (state) {
140 case WC_GBK_MBYTE1:
141 wtf_push_unknown(os, p-1, 1);
142 break;
143 }
144 return os;
145 }
146
147 void
wc_push_to_gbk(Str os,wc_wchar_t cc,wc_status * st)148 wc_push_to_gbk(Str os, wc_wchar_t cc, wc_status *st)
149 {
150 while (1) {
151 switch (cc.ccs) {
152 case WC_CCS_US_ASCII:
153 Strcat_char(os, (char)cc.code);
154 return;
155 case WC_CCS_GB_2312:
156 Strcat_char(os, (char)((cc.code >> 8) | 0x80));
157 Strcat_char(os, (char)((cc.code & 0xff) | 0x80));
158 return;
159 case WC_CCS_GBK_80:
160 Strcat_char(os, (char)(cc.code | 0x80));
161 return;
162 case WC_CCS_GBK_1:
163 case WC_CCS_GBK_2:
164 cc = wc_cs128w_to_gbk(cc);
165 case WC_CCS_GBK:
166 Strcat_char(os, (char)(cc.code >> 8));
167 Strcat_char(os, (char)(cc.code & 0xff));
168 return;
169 case WC_CCS_UNKNOWN_W:
170 if (!WcOption.no_replace)
171 Strcat_charp(os, WC_REPLACE_W);
172 return;
173 case WC_CCS_UNKNOWN:
174 if (!WcOption.no_replace)
175 Strcat_charp(os, WC_REPLACE);
176 return;
177 default:
178 #ifdef USE_UNICODE
179 if (WcOption.ucs_conv)
180 cc = wc_any_to_any_ces(cc, st);
181 else
182 #endif
183 cc.ccs = WC_CCS_IS_WIDE(cc.ccs) ? WC_CCS_UNKNOWN_W : WC_CCS_UNKNOWN;
184 continue;
185 }
186 }
187 }
188
189 Str
wc_char_conv_from_gbk(wc_uchar c,wc_status * st)190 wc_char_conv_from_gbk(wc_uchar c, wc_status *st)
191 {
192 static Str os;
193 static wc_uchar gbku;
194 wc_uint32 gbk;
195
196 if (st->state == -1) {
197 st->state = WC_GBK_NOSTATE;
198 os = Strnew_size(8);
199 }
200
201 switch (st->state) {
202 case WC_GBK_NOSTATE:
203 switch (WC_GBK_MAP[c]) {
204 case UB:
205 gbku = c;
206 st->state = WC_GBK_MBYTE1;
207 return NULL;
208 case C80:
209 wtf_push(os, WC_CCS_GBK_80, c);
210 break;
211 case C1:
212 break;
213 default:
214 Strcat_char(os, (char)c);
215 break;
216 }
217 break;
218 case WC_GBK_MBYTE1:
219 if (WC_GBK_MAP[c] & LB) {
220 gbk = ((wc_uint32)gbku << 8) | c;
221 if (gbku >= 0xA1 && c >= 0xA1)
222 wtf_push(os, wc_gb2312_or_gbk(gbk), gbk);
223 else
224 wtf_push(os, WC_CCS_GBK, gbk);
225 }
226 break;
227 }
228 st->state = -1;
229 return os;
230 }
231