1 /**
2  *  Yudit Unicode Editor Source File
3  *
4  *  GNU Copyright (C) 1997-2006  Gaspar Sinai <gaspar@yudit.org>
5  *
6  *  This program is free software; you can redistribute it and/or modify
7  *  it under the terms of the GNU General Public License, version 2,
 *  dated June 1991. See file COPYING for details.
9  *
10  *  This program is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  *  GNU General Public License for more details.
14  *
15  *  You should have received a copy of the GNU General Public License
16  *  along with this program; if not, write to the Free Software
17  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18  */
19 
20 #include "stoolkit/sencoder/SB_Java.h"
21 #include "stoolkit/SString.h"
22 #include "stoolkit/SStringVector.h"
23 #include <stdlib.h>
24 
25 /**
26  * This is a sample (base) implementation of the core encoding class
27  * @author: Gaspar Sinai <gaspar@yudit.org>
28  * @version: 2000-05-12
 * E2 80 A8 E2 80 A9 are line and paragraph seps in utf-8 (U+2028, U+2029)
30  */
SB_Java(bool _surrogate)31 SB_Java::SB_Java(bool _surrogate) : SBEncoder ("\n,\r\n,\r,\342\200\250,\342\200\251")
32 {
33   surrogate = _surrogate;
34 }
35 
~SB_Java()36 SB_Java::~SB_Java ()
37 {
38 }
39 
40 /**
41  * This is encoding a unicode string into a bytestring
42  * @param input is a unicode string.
43  */
44 const SString&
encode(const SV_UCS4 & input)45 SB_Java::encode (const SV_UCS4& input)
46 {
47   const SS_UCS4* in = input.array();
48   sstring.clear();
49   sstring.ensure(input.size()*2);
50   for (unsigned int i=0; i<input.size(); i++)
51   {
52     if (in[i]<0x80)
53     {
54       sstring.append ((char)in[i]);
55       continue;
56     }
57     SS_UCS4 c0 = in[i];
58     if (!surrogate && c0 >= 0x10000 && c0 <= 0x10ffff)
59     {
60       /* quote as surrogtes */
61       c0 = c0 - 0x10000;
62       quoteString(((c0>>10) & 0x3ff) + 0xd800);
63       quoteString((c0 & 0x3ff) + 0xdc00);
64       continue;
65     }
66     quoteString(c0);
67   }
68   return sstring;
69 }
70 
71 /**
72  * Decode an input string into a unicode string.
73  * @param input is a string.
74  *   he output can be null, in this case a line is not
75  *   read fully. If input size is zero output will be flushed.
76  */
77 const SV_UCS4&
decode(const SString & input)78 SB_Java::decode (const SString& input)
79 {
80   const unsigned char* in = (unsigned char*) input.array();
81   ucs4string.clear();
82   ucs4string.ensure(input.size());
83   SS_UCS4   decoded;
84   char*     next;
85 
86   for (unsigned i=0; i<input.size(); i++)
87   {
88     if (input.size() > i+5 && in[i] == '\\' && in[i+1] == 'u')
89     {
90       SString nin ((const char*)&in[i+2], 4);
91       nin.append ((char) 0);
92       decoded = (SS_UCS4)  strtoul (nin.array(),  &next, 16);
93       // success we append even zeros
94       if (nin.array() + 4 == next)
95       {
96         if (surrogate || decoded < 0xd800 || decoded > 0xdfff)
97         {
98           ucs4string.append (decoded);
99           i += 5;
100           continue;
101         }
102         /* lower surrogates came first */
103         if (decoded >= 0xdc00 || i + 6 + 5 >= input.size() ||  in[i+6] != '\\' || in[i+7] !=  'u')
104         {
105           quoteUCS4 ((SS_UCS2)decoded);
106           i += 5;
107           continue;
108         }
109         i += 6;
110         SString ninl ((const char*)&in[i+2], 4);
111         ninl.append ((char) 0);
112         SS_UCS4 decodedl = (SS_UCS4)  strtoul (ninl.array(),  &next, 16);
113         // success we append even zeros
114         if (ninl.array() + 4 != next || decodedl < 0xdc00 || decodedl > 0xdfff)
115         {
116           quoteUCS4 ((SS_UCS2)decoded);
117           i--; /* increment later */
118           continue;
119         }
120         ucs4string.append (((decoded&0x3ff) << 10) + (decodedl&0x3ff) + 0x10000);
121         i += 5; /* increment later */
122         continue;
123       }
124     }
125     if (input.size() > i+9 && in[i] == '\\' && in[i+1] == 'U')
126     {
127       SString nin ((const char*)&in[i+2], 8);
128       nin.append ((char) 0);
129       decoded = (SS_UCS4)  strtoul (nin.array(),  &next, 16);
130       // success we append even zeros
131       if (nin.array() + 8 == next)
132       {
133         ucs4string.append (decoded);
134         i += 9;
135         continue;
136       }
137     }
138 
139 	// life goes on.. try utf-8
140 
141     // Unexpected continuation bytes
142     if (in[i] <= 0xbf && in[i] >= 0x80)
143     {
144       quoteUCS4 (in[i]); continue;
145     }
146 
147     if ((in[i] & 0xe0) ==0xc0 && input.size()-i > 1 && (in[i+1] & 0xc0)==0x80 )
148     {
149       // check - the second
150       decoded = (((SS_UCS4)(in[i] & 0x1f)) << 6) | ((SS_UCS4) (in[i+1] & 0x3f));
151       if (decoded < 0x80)
152       {
153         quoteUCS4 ((SS_UCS2)decoded);
154       }
155       else
156       {
157         ucs4string.append (decoded);
158       }
159       i++;
160       continue;
161     }
162     if ((in[i] & 0xf0)==0xe0 && input.size()-i > 2
163       && (in[i+1] & 0xc0)==0x80 && (in[i+2] & 0xc0)==0x80)
164     {
165       decoded = (((unsigned short) (in[i] & 0x0f)) << 12)
166           | (((unsigned short) (in[i+1] & 0x3f))<<6)
167           | ((unsigned short) (in[i+2] & 0x3f));
168       if (decoded < 0x800)
169       {
170         quoteUCS4 ((SS_UCS2) decoded);
171       }
172       else
173       {
174         ucs4string.append (decoded);
175       }
176       i++;
177       i++;
178       continue;
179     }
180     if ((in[i] & 0xf8)==0xf0 && input.size()-i > 3
181       && (in[i+1] & 0xc0)==0x80 && (in[i+2] & 0xc0)==0x80
182       && (in[i+3] & 0xc0)==0x80)
183     {
184       decoded = (((unsigned int) (in[i] & 0x07)) << 18)
185         | (((unsigned int) (in[i+1] & 0x3f))<<12)
186         | (((unsigned short)(in[i+2] & 0x3f))<<6)
187         | ((unsigned short) (in[i+3] &  0x3f));
188       if (decoded < 0x10000)
189       {
190         quoteUCS4 ((SS_UCS4) decoded);
191       }
192       else
193       {
194         ucs4string.append (decoded);
195       }
196       i++;
197       i++;
198       i++;
199       continue;
200     }
201     if ((in[i] & 0xfc)==0xf8 && input.size()-i > 4
202       && (in[i+1] & 0xc0)==0x80 && (in[i+2] & 0xc0)==0x80
203       && (in[i+3] & 0xc0)==0x80 && (in[i+4] & 0xc0)==0x80)
204     {
205       decoded = (((unsigned int) (in[i] & 0x03)) << 24)
206         | (((unsigned int) (in[i+1] & 0x3f)) << 18)
207         | (((unsigned int) (in[i+2] & 0x3f))<<12)
208         | (((unsigned short) (in[i+3] & 0x3f))<<6)
209         | ((unsigned short) (in[i+4] & 0x3f));
210       if (decoded < 0x200000)
211       {
212         quoteUCS4 ((SS_UCS4) decoded);
213       }
214       else
215       {
216         ucs4string.append (decoded);
217       }
218       i++;
219       i++;
220       i++;
221       i++;
222       continue;
223     }
224     if ((in[i] & 0xfe)==0xfc && input.size()-i > 5
225       && (in[i+1] & 0xc0)==0x80 && (in[i+2] & 0xc0)==0x80
226       && (in[i+3] & 0xc0)==0x80 && (in[i+4] & 0xc0)==0x80
227       && (in[i+5] & 0xc0)==0x80)
228     {
229       decoded =  (((unsigned int) (in[i] & 0x01)) << 30)
230         | (((unsigned int) (in[i+1] & 0x3f)) << 24)
231         | (((unsigned int) (in[i+2] & 0x3f)) << 18)
232         | (((unsigned int) (in[i+3] & 0x3f))<<12)
233         | (((unsigned short)(in[i+4] & 0x3f))<<6)
234         | ((unsigned short) (in[i+5] &  0x3f));
235       if (decoded < 0x4000000)
236       {
237         quoteUCS4 ((SS_UCS4) decoded);
238       }
239       else
240       {
241         ucs4string.append (decoded);
242       }
243       i++;
244       i++;
245       i++;
246       i++;
247       i++;
248       continue;
249     }
250 
251     if (in[i] >= 0x80)
252     {
253       quoteUCS4 (in[i]);
254       continue;
255     }
256     // we translate broken utf8 into ucs2 also...
257     ucs4string.append ((SS_UCS4) in[i]);
258   }
259   return ucs4string;
260 }
261 
262 /**
263  * These methods guess the line delimiters for the input
264  * The one without arguments is giving the 'first approximation'
265  * It returns an inclusive list of all possibilities.
266  */
267 const SStringVector&
delimiters()268 SB_Java::delimiters ()
269 {
270   return realDelimiters;
271 }
272 
273 /**
274  * These methods guess the line delimiters for the input
 * The one without arguments is giving the 'first approximation';
 * this one takes a sample of the input.
 * It returns an exact list
277  */
278 const SStringVector&
delimiters(const SString & sample)279 SB_Java::delimiters (const SString& sample)
280 {
281   return sampleDelimiters;
282 }
283