1 /**
2  *  Yudit Unicode Editor Source File
3  *
4  *  GNU Copyright (C) 1997-2006  Gaspar Sinai <gaspar@yudit.org>
5  *
6  *  This program is free software; you can redistribute it and/or modify
7  *  it under the terms of the GNU General Public License, version 2,
8  *  dated June 1991. See file COPYING for details.
9  *
10  *  This program is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  *  GNU General Public License for more details.
14  *
15  *  You should have received a copy of the GNU General Public License
16  *  along with this program; if not, write to the Free Software
17  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18  */
19 
20 #include "stoolkit/sencoder/SB_X11_KSC.h"
21 #include "stoolkit/SString.h"
22 #include "stoolkit/SStringVector.h"
23 
24 #define SS_ESC 27
25 
26 /**
27  * This is a sample (base) implementation of the core encoding class
28  * @author: Gaspar Sinai <gaspar@yudit.org>
29  * @version: 2000-05-12
30  * E2 80 A8 and E2 80 A9 are the line and paragraph separators in UTF-8 (U+2028, U+2029)
31  */
SB_X11_KSC()32 SB_X11_KSC::SB_X11_KSC() : SBEncoder ("\n,\r\n,\r"), ksc_5601_r ("ksc-5601-r")
33 {
34   ok = ksc_5601_r.isOK();
35 }
36 
~SB_X11_KSC()37 SB_X11_KSC::~SB_X11_KSC ()
38 {
39 }
40 
41 /**
42  * return false if this generic encoder does not exist.
43  */
44 bool
isOK() const45 SB_X11_KSC::isOK() const
46 {
47   return ok;
48 }
49 
50 /**
51  * This is encoding a unicode string into a bytestring
52  * @param input is a unicode string.
53  */
54 const SString&
encode(const SV_UCS4 & input)55 SB_X11_KSC::encode (const SV_UCS4& input)
56 {
57   const SS_UCS4* in = input.array();
58   sstring.clear();
59   sstring.ensure(input.size()*2);
60   SS_UCS2   got;
61   SUniMap*  current=0;
62 
63   for (unsigned int i=0; i<input.size(); i++)
64   {
65     if (in[i] < 0x80)
66     {
67       if (current != 0)
68       {
69         sstring.append ((char) SS_ESC);
70         sstring.append ((char) '(');
71         sstring.append ((char) 'B');
72       }
73       current=0;
74       sstring.append ((char) in[i]);
75       continue;
76     }
77 
78     if (ksc_5601_r.isOK() && (got=ksc_5601_r.encode (in[i])) != 0)
79     {
80       if ((got&0xff00) > 0xa000 && (got&0xff00) < 0xff00
81         && (got&0xff) > 0xa0 && (got&0xff) < 0xff)
82       {
83         if (current==0)
84         {
85           sstring.append ((char) SS_ESC);
86           sstring.append ((char) '$');
87           sstring.append ((char) '(');
88           sstring.append ((char) 'C');
89         }
90         current=&ksc_5601_r;
91         sstring.append ((char) ((got&0x7f00)>>8));
92         sstring.append ((char) (got&0x7f));
93         continue;
94       }
95     }
96     quoteString (in[i]);
97   }
98   if (current != 0)
99   {
100     sstring.append ((char) SS_ESC);
101     sstring.append ((char) '(');
102     sstring.append ((char) 'B');
103   }
104   return sstring;
105 }
106 
107 /**
108  * Decode an input string into a unicode string.
109  * @param input is a string.
110  *   he output can be null, in this case a line is not
111  *   read fully. If input size is zero output will be flushed.
112  */
113 const SV_UCS4&
decode(const SString & input)114 SB_X11_KSC::decode (const SString& input)
115 {
116   const unsigned char* in = (unsigned char*) input.array();
117   ucs4string.clear();
118   ucs4string.ensure(input.size());
119   SS_UCS4   got;
120   SUniMap*  current=0;
121 
122   for (unsigned i=0; i<input.size(); i++)
123   {
124     if (input.size() > i+3 && in[i] == SS_ESC
125       && in[i+1] == '$'
126       && in[i+2] == '('
127       && in[i+3] == 'C')
128     {
129       current=&ksc_5601_r;  // Korean
130       i++; i++; i++;
131       continue;
132     }
133     if (input.size() > i+2 && in[i] == SS_ESC && in[i+1] == '('
134       && in[i+2] == 'B')
135     {
136       current=0; // ASCII
137       i++; i++;
138       continue;
139     }
140     if (input.size() > i+3 && in[i] == SS_ESC && in[i+1] == '('
141       && in[i+2] == 'A' && in[i+3] > 0x20 && in[i+3] <0x7f)
142     {
143       current=0; // 1 byte ASCII
144       i++; i++; i++;
145       ucs4string.append ((SS_UCS4) in[i]);
146       continue;
147     }
148 
149     // It should not happen but it does.
150     if (in[i] < ' ') current=0;
151 
152     if (current!=0)
153     {
154       if ( input.size() > i+1 && in[i]>0x20 && in[i] < 0x7F
155         && in[i+1] > 0x20 && in[i+1] < 0x7F)
156       {
157         got = current->decode (((((SS_UCS2)in[i]<< 8) | in[i+1]) | 0x8080 )) ;
158       }
159       else
160       {
161         got = 0;
162       }
163       if (got != 0)
164       {
165         ucs4string.append (got);
166       }
167       else
168       {
169         quoteUCS4 ((unsigned char) in[i]);
170         quoteUCS4 ((unsigned char) in[i+1]);
171       }
172       i++;
173       continue;
174     }
175     ucs4string.append ((SS_UCS4) in[i]);
176     continue;
177   }
178   return ucs4string;
179 }
180 
181 
182 /**
183  * These methods guess the line delimiters for the input
184  * The one without arguments is giving the 'first approximation'
185  * It returns an inclusive list of all possibilities.
186  */
187 const SStringVector&
delimiters()188 SB_X11_KSC::delimiters ()
189 {
190   return realDelimiters;
191 }
192 
193 /**
194  * These methods guess the line delimiters for the input
195  * The one without arguments is giving the 'first approximation'
196  * It returns an exact list
197  */
198 const SStringVector&
delimiters(const SString & sample)199 SB_X11_KSC::delimiters (const SString& sample)
200 {
201   return sampleDelimiters;
202 }
203