1 
2 #include "wc.h"
3 #include "gbk.h"
4 #include "search.h"
5 #include "wtf.h"
6 #ifdef USE_UNICODE
7 #include "ucs.h"
8 #endif
9 
10 #include "map/gb2312_gbk.map"
11 
/*
 * Short file-local aliases for the WC_GBK_MAP_* byte-class constants, used
 * to keep the WC_GBK_MAP table and the decoder switch statements compact.
 * The WC_GBK_MAP_* values themselves are defined outside this file
 * (presumably in gbk.h -- confirm).
 */
#define C0 WC_GBK_MAP_C0
#define GL WC_GBK_MAP_GL
#define C1 WC_GBK_MAP_C1
#define LB WC_GBK_MAP_LB
#define UB WC_GBK_MAP_UB
#define C80 WC_GBK_MAP_80
18 
/*
 * Byte classification table for GBK input, indexed by byte value
 * (0x00 - 0xFF), 16 entries per row:
 *
 *   0x00 - 0x1F  C0
 *   0x20 - 0x3F  GL
 *   0x40 - 0x7E  LB   (the decoders test a second byte with "& LB")
 *   0x7F         C0
 *   0x80         C80
 *   0x81 - 0xFE  UB   (starts a two-byte sequence in the decoders)
 *   0xFF         C1
 *
 * NOTE(review): the decoders in this file expect second bytes >= 0xA1 to
 * pass the "& LB" test, so UB presumably has the LB bit set -- confirm
 * against the WC_GBK_MAP_* definitions.
 */
wc_uint8 WC_GBK_MAP[ 0x100 ] = {
    /* 0x00 - 0x1F */
    C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0,
    C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0,
    /* 0x20 - 0x3F */
    GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL,
    GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL,
    /* 0x40 - 0x7F (last entry, 0x7F, is C0) */
    LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB,
    LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB,
    LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB,
    LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, C0,

    /* 0x80 - 0xFF (first entry, 0x80, is C80; last entry, 0xFF, is C1) */
    C80,UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB,
    UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB,
    UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB,
    UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB,
    UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB,
    UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB,
    UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB,
    UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, C1,
};
38 
39 wc_ccs
wc_gb2312_or_gbk(wc_uint16 code)40 wc_gb2312_or_gbk(wc_uint16 code) {
41     return wc_map_range_search(code,
42 	gb2312_gbk_map, N_gb2312_gbk_map)
43 	? WC_CCS_GBK : WC_CCS_GB_2312;
44 }
45 
46 wc_wchar_t
wc_gbk_to_cs128w(wc_wchar_t cc)47 wc_gbk_to_cs128w(wc_wchar_t cc)
48 {
49     cc.code = WC_GBK_N(cc.code);
50     if (cc.code < 0x4000)
51 	cc.ccs = WC_CCS_GBK_1;
52     else {
53 	cc.ccs = WC_CCS_GBK_2;
54 	cc.code -= 0x4000;
55     }
56     cc.code = WC_N_CS128W(cc.code);
57     return cc;
58 }
59 
60 wc_wchar_t
wc_cs128w_to_gbk(wc_wchar_t cc)61 wc_cs128w_to_gbk(wc_wchar_t cc)
62 {
63     cc.code = WC_CS128W_N(cc.code);
64     if (cc.ccs == WC_CCS_GBK_2)
65 	cc.code += 0x4000;
66     cc.ccs = WC_CCS_GBK;
67     cc.code = WC_N_GBK(cc.code);
68     return cc;
69 }
70 
71 wc_uint32
wc_gbk_to_N(wc_uint32 c)72 wc_gbk_to_N(wc_uint32 c)
73 {
74     if (c <= 0xA1A0)	/* 0x8140 - 0xA1A0 */
75 	return WC_GBK_N(c);
76     if (c <= 0xA2AA)	/* 0xA240 - 0xA2A0, 0xA2A1 - 0xA2AA */
77 	return WC_GBK_N(c) - ((c >> 8) - 0xA1) * 0x5E;
78     if (c <= 0xA6A0)	/* 0xA240 - 0xA6A0 */
79 	return WC_GBK_N(c) - ((c >> 8) - 0xA1) * 0x5E + 0x0A;
80     if (c <= 0xA6F5)	/* 0xA6E0 - 0xA6F5 */
81 	return WC_GBK_N(c) - ((c >> 8) - 0xA1) * 0x5E + 0x0A - 0x3F;
82     if (c <= 0xA8A0)	/* 0xA7A0 - 0xA8A0 */
83 	return WC_GBK_N(c) - ((c >> 8) - 0xA1) * 0x5E + 0x0A + 0x16;
84     if (c <= 0xA8C0)	/* 0xA8BB - 0xA8C0 */
85 	return WC_GBK_N(c) - ((c >> 8) - 0xA1) * 0x5E + 0x0A + 0x16 - 0x1A;
86 			/* 0xA940 - 0xFEA0 */
87     return WC_GBK_N(c) - ((c >> 8) - 0xA1) * 0x5E + 0x0A + 0x16 + 0x06;
88 }
89 
90 Str
wc_conv_from_gbk(Str is,wc_ces ces)91 wc_conv_from_gbk(Str is, wc_ces ces)
92 {
93     Str os;
94     wc_uchar *sp = (wc_uchar *)is->ptr;
95     wc_uchar *ep = sp + is->length;
96     wc_uchar *p;
97     int state = WC_GBK_NOSTATE;
98     wc_uint32 gbk;
99 
100     for (p = sp; p < ep && *p < 0x80; p++)
101 	;
102     if (p == ep)
103 	return is;
104     os = Strnew_size(is->length);
105     if (p > sp)
106 	Strcat_charp_n(os, (char *)is->ptr, (int)(p - sp));
107 
108     for (; p < ep; p++) {
109 	switch (state) {
110 	case WC_GBK_NOSTATE:
111 	    switch (WC_GBK_MAP[*p]) {
112 	    case UB:
113 		state = WC_GBK_MBYTE1;
114 		break;
115 	    case C80:
116 		wtf_push(os, WC_CCS_GBK_80, *p);
117 		break;
118 	    case C1:
119 		wtf_push_unknown(os, p, 1);
120 		break;
121 	    default:
122 		Strcat_char(os, (char)*p);
123 		break;
124 	    }
125 	    break;
126 	case WC_GBK_MBYTE1:
127 	    if (WC_GBK_MAP[*p] & LB) {
128 		gbk = ((wc_uint32)*(p-1) << 8) | *p;
129 		if (*(p-1) >= 0xA1 && *p >= 0xA1)
130 		    wtf_push(os, wc_gb2312_or_gbk(gbk), gbk);
131 		else
132 		    wtf_push(os, WC_CCS_GBK, gbk);
133 	    } else
134 		wtf_push_unknown(os, p-1, 2);
135 	    state = WC_GBK_NOSTATE;
136 	    break;
137 	}
138     }
139     switch (state) {
140     case WC_GBK_MBYTE1:
141 	wtf_push_unknown(os, p-1, 1);
142 	break;
143     }
144     return os;
145 }
146 
147 void
wc_push_to_gbk(Str os,wc_wchar_t cc,wc_status * st)148 wc_push_to_gbk(Str os, wc_wchar_t cc, wc_status *st)
149 {
150   while (1) {
151     switch (cc.ccs) {
152     case WC_CCS_US_ASCII:
153 	Strcat_char(os, (char)cc.code);
154 	return;
155     case WC_CCS_GB_2312:
156 	Strcat_char(os, (char)((cc.code >> 8) | 0x80));
157 	Strcat_char(os, (char)((cc.code & 0xff) | 0x80));
158 	return;
159     case WC_CCS_GBK_80:
160 	Strcat_char(os, (char)(cc.code | 0x80));
161 	return;
162     case WC_CCS_GBK_1:
163     case WC_CCS_GBK_2:
164 	cc = wc_cs128w_to_gbk(cc);
165     case WC_CCS_GBK:
166 	Strcat_char(os, (char)(cc.code >> 8));
167 	Strcat_char(os, (char)(cc.code & 0xff));
168 	return;
169     case WC_CCS_UNKNOWN_W:
170 	if (!WcOption.no_replace)
171 	    Strcat_charp(os, WC_REPLACE_W);
172 	return;
173     case WC_CCS_UNKNOWN:
174 	if (!WcOption.no_replace)
175 	    Strcat_charp(os, WC_REPLACE);
176 	return;
177     default:
178 #ifdef USE_UNICODE
179 	if (WcOption.ucs_conv)
180 	    cc = wc_any_to_any_ces(cc, st);
181 	else
182 #endif
183 	    cc.ccs = WC_CCS_IS_WIDE(cc.ccs) ? WC_CCS_UNKNOWN_W : WC_CCS_UNKNOWN;
184 	continue;
185     }
186   }
187 }
188 
189 Str
wc_char_conv_from_gbk(wc_uchar c,wc_status * st)190 wc_char_conv_from_gbk(wc_uchar c, wc_status *st)
191 {
192     static Str os;
193     static wc_uchar gbku;
194     wc_uint32 gbk;
195 
196     if (st->state == -1) {
197 	st->state = WC_GBK_NOSTATE;
198 	os = Strnew_size(8);
199     }
200 
201     switch (st->state) {
202     case WC_GBK_NOSTATE:
203 	switch (WC_GBK_MAP[c]) {
204 	case UB:
205 	    gbku = c;
206 	    st->state = WC_GBK_MBYTE1;
207 	    return NULL;
208 	case C80:
209 	    wtf_push(os, WC_CCS_GBK_80, c);
210 	    break;
211 	case C1:
212 	    break;
213 	default:
214 	    Strcat_char(os, (char)c);
215 	    break;
216 	}
217 	break;
218     case WC_GBK_MBYTE1:
219 	if (WC_GBK_MAP[c] & LB) {
220 	    gbk = ((wc_uint32)gbku << 8) | c;
221 	    if (gbku >= 0xA1 && c >= 0xA1)
222 		wtf_push(os, wc_gb2312_or_gbk(gbk), gbk);
223 	    else
224 		wtf_push(os, WC_CCS_GBK, gbk);
225 	}
226 	break;
227     }
228     st->state = -1;
229     return os;
230 }
231