1 /**
2 * Yudit Unicode Editor Source File
3 *
4 * GNU Copyright (C) 1997-2006 Gaspar Sinai <gaspar@yudit.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License, version 2,
8 * dated June 1991. See file COPYYING for details.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19
20 #include "stoolkit/sencoder/SB_Java.h"
21 #include "stoolkit/SString.h"
22 #include "stoolkit/SStringVector.h"
23 #include <stdlib.h>
24
25 /**
26 * This is a sample (base) implementation of the core encoding class
27 * @author: Gaspar Sinai <gaspar@yudit.org>
28 * @version: 2000-05-12
 * E2 80 A8 and E2 80 A9 are the UTF-8 forms of the line and paragraph separators (U+2028, U+2029)
30 */
SB_Java(bool _surrogate)31 SB_Java::SB_Java(bool _surrogate) : SBEncoder ("\n,\r\n,\r,\342\200\250,\342\200\251")
32 {
33 surrogate = _surrogate;
34 }
35
~SB_Java()36 SB_Java::~SB_Java ()
37 {
38 }
39
40 /**
41 * This is encoding a unicode string into a bytestring
42 * @param input is a unicode string.
43 */
44 const SString&
encode(const SV_UCS4 & input)45 SB_Java::encode (const SV_UCS4& input)
46 {
47 const SS_UCS4* in = input.array();
48 sstring.clear();
49 sstring.ensure(input.size()*2);
50 for (unsigned int i=0; i<input.size(); i++)
51 {
52 if (in[i]<0x80)
53 {
54 sstring.append ((char)in[i]);
55 continue;
56 }
57 SS_UCS4 c0 = in[i];
58 if (!surrogate && c0 >= 0x10000 && c0 <= 0x10ffff)
59 {
60 /* quote as surrogtes */
61 c0 = c0 - 0x10000;
62 quoteString(((c0>>10) & 0x3ff) + 0xd800);
63 quoteString((c0 & 0x3ff) + 0xdc00);
64 continue;
65 }
66 quoteString(c0);
67 }
68 return sstring;
69 }
70
71 /**
72 * Decode an input string into a unicode string.
73 * @param input is a string.
74 * he output can be null, in this case a line is not
75 * read fully. If input size is zero output will be flushed.
76 */
77 const SV_UCS4&
decode(const SString & input)78 SB_Java::decode (const SString& input)
79 {
80 const unsigned char* in = (unsigned char*) input.array();
81 ucs4string.clear();
82 ucs4string.ensure(input.size());
83 SS_UCS4 decoded;
84 char* next;
85
86 for (unsigned i=0; i<input.size(); i++)
87 {
88 if (input.size() > i+5 && in[i] == '\\' && in[i+1] == 'u')
89 {
90 SString nin ((const char*)&in[i+2], 4);
91 nin.append ((char) 0);
92 decoded = (SS_UCS4) strtoul (nin.array(), &next, 16);
93 // success we append even zeros
94 if (nin.array() + 4 == next)
95 {
96 if (surrogate || decoded < 0xd800 || decoded > 0xdfff)
97 {
98 ucs4string.append (decoded);
99 i += 5;
100 continue;
101 }
102 /* lower surrogates came first */
103 if (decoded >= 0xdc00 || i + 6 + 5 >= input.size() || in[i+6] != '\\' || in[i+7] != 'u')
104 {
105 quoteUCS4 ((SS_UCS2)decoded);
106 i += 5;
107 continue;
108 }
109 i += 6;
110 SString ninl ((const char*)&in[i+2], 4);
111 ninl.append ((char) 0);
112 SS_UCS4 decodedl = (SS_UCS4) strtoul (ninl.array(), &next, 16);
113 // success we append even zeros
114 if (ninl.array() + 4 != next || decodedl < 0xdc00 || decodedl > 0xdfff)
115 {
116 quoteUCS4 ((SS_UCS2)decoded);
117 i--; /* increment later */
118 continue;
119 }
120 ucs4string.append (((decoded&0x3ff) << 10) + (decodedl&0x3ff) + 0x10000);
121 i += 5; /* increment later */
122 continue;
123 }
124 }
125 if (input.size() > i+9 && in[i] == '\\' && in[i+1] == 'U')
126 {
127 SString nin ((const char*)&in[i+2], 8);
128 nin.append ((char) 0);
129 decoded = (SS_UCS4) strtoul (nin.array(), &next, 16);
130 // success we append even zeros
131 if (nin.array() + 8 == next)
132 {
133 ucs4string.append (decoded);
134 i += 9;
135 continue;
136 }
137 }
138
139 // life goes on.. try utf-8
140
141 // Unexpected continuation bytes
142 if (in[i] <= 0xbf && in[i] >= 0x80)
143 {
144 quoteUCS4 (in[i]); continue;
145 }
146
147 if ((in[i] & 0xe0) ==0xc0 && input.size()-i > 1 && (in[i+1] & 0xc0)==0x80 )
148 {
149 // check - the second
150 decoded = (((SS_UCS4)(in[i] & 0x1f)) << 6) | ((SS_UCS4) (in[i+1] & 0x3f));
151 if (decoded < 0x80)
152 {
153 quoteUCS4 ((SS_UCS2)decoded);
154 }
155 else
156 {
157 ucs4string.append (decoded);
158 }
159 i++;
160 continue;
161 }
162 if ((in[i] & 0xf0)==0xe0 && input.size()-i > 2
163 && (in[i+1] & 0xc0)==0x80 && (in[i+2] & 0xc0)==0x80)
164 {
165 decoded = (((unsigned short) (in[i] & 0x0f)) << 12)
166 | (((unsigned short) (in[i+1] & 0x3f))<<6)
167 | ((unsigned short) (in[i+2] & 0x3f));
168 if (decoded < 0x800)
169 {
170 quoteUCS4 ((SS_UCS2) decoded);
171 }
172 else
173 {
174 ucs4string.append (decoded);
175 }
176 i++;
177 i++;
178 continue;
179 }
180 if ((in[i] & 0xf8)==0xf0 && input.size()-i > 3
181 && (in[i+1] & 0xc0)==0x80 && (in[i+2] & 0xc0)==0x80
182 && (in[i+3] & 0xc0)==0x80)
183 {
184 decoded = (((unsigned int) (in[i] & 0x07)) << 18)
185 | (((unsigned int) (in[i+1] & 0x3f))<<12)
186 | (((unsigned short)(in[i+2] & 0x3f))<<6)
187 | ((unsigned short) (in[i+3] & 0x3f));
188 if (decoded < 0x10000)
189 {
190 quoteUCS4 ((SS_UCS4) decoded);
191 }
192 else
193 {
194 ucs4string.append (decoded);
195 }
196 i++;
197 i++;
198 i++;
199 continue;
200 }
201 if ((in[i] & 0xfc)==0xf8 && input.size()-i > 4
202 && (in[i+1] & 0xc0)==0x80 && (in[i+2] & 0xc0)==0x80
203 && (in[i+3] & 0xc0)==0x80 && (in[i+4] & 0xc0)==0x80)
204 {
205 decoded = (((unsigned int) (in[i] & 0x03)) << 24)
206 | (((unsigned int) (in[i+1] & 0x3f)) << 18)
207 | (((unsigned int) (in[i+2] & 0x3f))<<12)
208 | (((unsigned short) (in[i+3] & 0x3f))<<6)
209 | ((unsigned short) (in[i+4] & 0x3f));
210 if (decoded < 0x200000)
211 {
212 quoteUCS4 ((SS_UCS4) decoded);
213 }
214 else
215 {
216 ucs4string.append (decoded);
217 }
218 i++;
219 i++;
220 i++;
221 i++;
222 continue;
223 }
224 if ((in[i] & 0xfe)==0xfc && input.size()-i > 5
225 && (in[i+1] & 0xc0)==0x80 && (in[i+2] & 0xc0)==0x80
226 && (in[i+3] & 0xc0)==0x80 && (in[i+4] & 0xc0)==0x80
227 && (in[i+5] & 0xc0)==0x80)
228 {
229 decoded = (((unsigned int) (in[i] & 0x01)) << 30)
230 | (((unsigned int) (in[i+1] & 0x3f)) << 24)
231 | (((unsigned int) (in[i+2] & 0x3f)) << 18)
232 | (((unsigned int) (in[i+3] & 0x3f))<<12)
233 | (((unsigned short)(in[i+4] & 0x3f))<<6)
234 | ((unsigned short) (in[i+5] & 0x3f));
235 if (decoded < 0x4000000)
236 {
237 quoteUCS4 ((SS_UCS4) decoded);
238 }
239 else
240 {
241 ucs4string.append (decoded);
242 }
243 i++;
244 i++;
245 i++;
246 i++;
247 i++;
248 continue;
249 }
250
251 if (in[i] >= 0x80)
252 {
253 quoteUCS4 (in[i]);
254 continue;
255 }
256 // we translate broken utf8 into ucs2 also...
257 ucs4string.append ((SS_UCS4) in[i]);
258 }
259 return ucs4string;
260 }
261
262 /**
263 * These methods guess the line delimiters for the input
264 * The one without arguments is giving the 'first approximation'
265 * It returns an inclusive list of all possibilities.
266 */
267 const SStringVector&
delimiters()268 SB_Java::delimiters ()
269 {
270 return realDelimiters;
271 }
272
273 /**
274 * These methods guess the line delimiters for the input
275 * The one without arguments is giving the 'first approximation'
276 * It returns an exact list
277 */
278 const SStringVector&
delimiters(const SString & sample)279 SB_Java::delimiters (const SString& sample)
280 {
281 return sampleDelimiters;
282 }
283