1 /**
2  *  Yudit Unicode Editor Source File
3  *
4  *  GNU Copyright (C) 1997-2006  Gaspar Sinai <gaspar@yudit.org>
5  *
6  *  This program is free software; you can redistribute it and/or modify
7  *  it under the terms of the GNU General Public License, version 2,
8  *  dated June 1991. See file COPYING for details.
9  *
10  *  This program is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  *  GNU General Public License for more details.
14  *
15  *  You should have received a copy of the GNU General Public License
16  *  along with this program; if not, write to the Free Software
17  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18  */
19 
20 /*
21     [first] [second]
22     [0x00-0x7F]  [0xA1-0xDF] ->JIS X0201 same code single byte
23     [0x81-0x9F or 0xE0-0xEF]  [0x40-0x7E or 0x80-0xFC] JIS X0208 2 bytes
24     [0xF0-0xFC]  [0x40-0x7E or 0x80-0xFC] 2444 user-defined characters
25 */
26 
27 #include "stoolkit/sencoder/SB_S_JIS0213.h"
28 #include "stoolkit/SString.h"
29 #include "stoolkit/SStringVector.h"
30 #include "stoolkit/SCluster.h"
31 
32 /**
33  * This is a sample (base) implementation of the core encoding class
34  * @author: Gaspar Sinai <gaspar@yudit.org>
35  * @version: 2000-05-12
36  * E2 80 A8 E2 80 A9 are paragraph and line seps in utf-8 (U+20A8, U+20A9)
37  */
SB_S_JIS0213()38 SB_S_JIS0213::SB_S_JIS0213() : SBEncoder ("\n,\r\n,\r"),
39   sjis0213 ("shift-jis-3")
40 {
41   ok = sjis0213.isOK();
42 }
43 
~SB_S_JIS0213()44 SB_S_JIS0213::~SB_S_JIS0213 ()
45 {
46 }
47 
48 /**
49  * return false if this generic encoder does not exist.
50  */
51 bool
isOK() const52 SB_S_JIS0213::isOK() const
53 {
54   return ok;
55 }
56 
57 /**
58  * This is encoding a unicode string into a bytestring
59  * @param input is a unicode string.
60  */
61 const SString&
encode(const SV_UCS4 & input)62 SB_S_JIS0213::encode (const SV_UCS4& input)
63 {
64   const SS_UCS4* in = input.array();
65   sstring.clear();
66   sstring.ensure(input.size()*2);
67   unsigned char c0;
68   unsigned char c1;
69 
70   for (unsigned int i=0; i<input.size(); i++)
71   {
72     if (in[i] < 0x80)
73     {
74       if (in[i] == '\\')
75       {
76         sstring.append ((char)0x80);
77       }
78       else
79       {
80         sstring.append ((char) in[i]);
81       }
82       continue;
83     }
84     // half-width yen
85     if (in[i] == 0x00a5)
86     {
87       sstring.append ('\\');
88       continue;
89     }
90 
91     if (!sjis0213.isOK())
92     {
93       quoteString (in[i]);
94       continue;
95     }
96     /* lift */
97     SV_UCS4 decd;
98     SV_UCS4 enc;
99     enc.append (in[i]);
100     if (i+1 < input.size()) enc.append (in[i+1]);
101     if (i+2 < input.size()) enc.append (in[i+2]);
102     unsigned int lifted = sjis0213.lift (enc, 0, false, &decd);
103     if (lifted == 0 || decd.size() == 0 || decd[0] == 0)
104     {
105       quoteString (in[i]);
106       continue;
107     }
108     c0 = (decd[0]&0xff);
109     c1 = ((decd[0]>>8)&0xff);
110     if (c1 == 0)
111     {
112       sstring.append ((char)(c0));
113     } else {
114       sstring.append ((char)(c1));
115       sstring.append ((char)(c0));
116     }
117     i = i+lifted-1;
118   }
119   return sstring;
120 }
121 
122 /**
123  * Decode an input string into a unicode string.
124  * @param input is a string.
125  *   he output can be null, in this case a line is not
126  *   read fully. If input size is zero output will be flushed.
127  */
128 const SV_UCS4&
decode(const SString & input)129 SB_S_JIS0213::decode (const SString& input)
130 {
131   const unsigned char* in = (unsigned char*) input.array();
132   ucs4string.clear();
133   ucs4string.ensure(input.size());
134 
135   for (unsigned i=0; i<input.size(); i++)
136   {
137     // MAC - backslash
138     if (in[i] == 0x80)
139     {
140       ucs4string.append ((SS_UCS4) '\\');
141       continue;
142     }
143     // half width yen
144     if (in[i] == '\\')
145     {
146       ucs4string.append ((SS_UCS4) 0x00a5);
147       continue;
148     }
149     // MAC - copyright
150     if (in[i] == 0xfd)
151     {
152       ucs4string.append ((SS_UCS4) 0xa9);
153       continue;
154     }
155     // MAC - tm
156     if (in[i] == 0xfe)
157     {
158       ucs4string.append ((SS_UCS4) 0x2122);
159       continue;
160     }
161     // MAC - ... horizontal ellipsis
162       if (in[i] == 0xff)
163     {
164       ucs4string.append ((SS_UCS4) 0x2026);
165       continue;
166     }
167     if (sjis0213.isOK())
168     {
169       /* lift */
170       SV_UCS4 ucs4;
171       SV_UCS4 decd;
172       SS_UCS4 ch = (SS_UCS4)in[i];
173       ucs4.append (ch);
174       unsigned int lifted = sjis0213.lift (ucs4, 0, true, &decd);
175       if (lifted != 0 && decd.size() != 0 && decd[0] != 0)
176       {
177         expandYuditLigatures (&decd);
178         ucs4string.append (decd);
179         continue;
180       }
181       /* try two  bytes */
182       if (i+1 < input.size())
183       {
184          ucs4.clear();
185          ch = ch << 8;
186          ch += (SS_UCS4)in[i+1];
187          ucs4.append (ch);
188          lifted = sjis0213.lift (ucs4, 0, true, &decd);
189          if (lifted != 0 && decd.size() != 0 && decd[0] != 0)
190          {
191            expandYuditLigatures (&decd);
192            ucs4string.append (decd);
193            i++;
194            continue;
195          }
196       }
197     }
198 
199     if (in[i] > 0x80)
200     {
201       quoteUCS4 (in[i]);
202       continue;
203     }
204     ucs4string.append ((SS_UCS4) in[i]);
205   }
206   return ucs4string;
207 }
208 
209 
210 /**
211  * These methods guess the line delimiters for the input
212  * The one without arguments is giving the 'first approximation'
213  * It returns an inclusive list of all possibilities.
214  */
215 const SStringVector&
delimiters()216 SB_S_JIS0213::delimiters ()
217 {
218   return realDelimiters;
219 }
220 
221 /**
222  * These methods guess the line delimiters for the input
223  * The one without arguments is giving the 'first approximation'
224  * It returns an exact list
225  */
226 const SStringVector&
delimiters(const SString & sample)227 SB_S_JIS0213::delimiters (const SString& sample)
228 {
229   return sampleDelimiters;
230 }
231