1 /**
2 * Yudit Unicode Editor Source File
3 *
4 * GNU Copyright (C) 1997-2006 Gaspar Sinai <gaspar@yudit.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License, version 2,
8 * dated June 1991. See file COPYING for details.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19
20 /*
21 [first] [second]
22 [0x00-0x7F] [0xA1-0xDF] ->JIS X0201 same code single byte
23 [0x81-0x9F or 0xE0-0xEF] [0x40-0x7E or 0x80-0xFC] JIS X0208 2 bytes
24 [0xF0-0xFC] [0x40-0x7E or 0x80-0xFC] 2444 user-defined characters
25 */
26
27 #include "stoolkit/sencoder/SB_S_JIS0213.h"
28 #include "stoolkit/SString.h"
29 #include "stoolkit/SStringVector.h"
30 #include "stoolkit/SCluster.h"
31
32 /**
33 * This is a sample (base) implementation of the core encoding class
34 * @author: Gaspar Sinai <gaspar@yudit.org>
35 * @version: 2000-05-12
36 * E2 80 A8 E2 80 A9 are paragraph and line seps in utf-8 (U+20A8, U+20A9)
37 */
SB_S_JIS0213()38 SB_S_JIS0213::SB_S_JIS0213() : SBEncoder ("\n,\r\n,\r"),
39 sjis0213 ("shift-jis-3")
40 {
41 ok = sjis0213.isOK();
42 }
43
~SB_S_JIS0213()44 SB_S_JIS0213::~SB_S_JIS0213 ()
45 {
46 }
47
48 /**
49 * return false if this generic encoder does not exist.
50 */
51 bool
isOK() const52 SB_S_JIS0213::isOK() const
53 {
54 return ok;
55 }
56
57 /**
58 * This is encoding a unicode string into a bytestring
59 * @param input is a unicode string.
60 */
61 const SString&
encode(const SV_UCS4 & input)62 SB_S_JIS0213::encode (const SV_UCS4& input)
63 {
64 const SS_UCS4* in = input.array();
65 sstring.clear();
66 sstring.ensure(input.size()*2);
67 unsigned char c0;
68 unsigned char c1;
69
70 for (unsigned int i=0; i<input.size(); i++)
71 {
72 if (in[i] < 0x80)
73 {
74 if (in[i] == '\\')
75 {
76 sstring.append ((char)0x80);
77 }
78 else
79 {
80 sstring.append ((char) in[i]);
81 }
82 continue;
83 }
84 // half-width yen
85 if (in[i] == 0x00a5)
86 {
87 sstring.append ('\\');
88 continue;
89 }
90
91 if (!sjis0213.isOK())
92 {
93 quoteString (in[i]);
94 continue;
95 }
96 /* lift */
97 SV_UCS4 decd;
98 SV_UCS4 enc;
99 enc.append (in[i]);
100 if (i+1 < input.size()) enc.append (in[i+1]);
101 if (i+2 < input.size()) enc.append (in[i+2]);
102 unsigned int lifted = sjis0213.lift (enc, 0, false, &decd);
103 if (lifted == 0 || decd.size() == 0 || decd[0] == 0)
104 {
105 quoteString (in[i]);
106 continue;
107 }
108 c0 = (decd[0]&0xff);
109 c1 = ((decd[0]>>8)&0xff);
110 if (c1 == 0)
111 {
112 sstring.append ((char)(c0));
113 } else {
114 sstring.append ((char)(c1));
115 sstring.append ((char)(c0));
116 }
117 i = i+lifted-1;
118 }
119 return sstring;
120 }
121
122 /**
123 * Decode an input string into a unicode string.
124 * @param input is a string.
125 * he output can be null, in this case a line is not
126 * read fully. If input size is zero output will be flushed.
127 */
128 const SV_UCS4&
decode(const SString & input)129 SB_S_JIS0213::decode (const SString& input)
130 {
131 const unsigned char* in = (unsigned char*) input.array();
132 ucs4string.clear();
133 ucs4string.ensure(input.size());
134
135 for (unsigned i=0; i<input.size(); i++)
136 {
137 // MAC - backslash
138 if (in[i] == 0x80)
139 {
140 ucs4string.append ((SS_UCS4) '\\');
141 continue;
142 }
143 // half width yen
144 if (in[i] == '\\')
145 {
146 ucs4string.append ((SS_UCS4) 0x00a5);
147 continue;
148 }
149 // MAC - copyright
150 if (in[i] == 0xfd)
151 {
152 ucs4string.append ((SS_UCS4) 0xa9);
153 continue;
154 }
155 // MAC - tm
156 if (in[i] == 0xfe)
157 {
158 ucs4string.append ((SS_UCS4) 0x2122);
159 continue;
160 }
161 // MAC - ... horizontal ellipsis
162 if (in[i] == 0xff)
163 {
164 ucs4string.append ((SS_UCS4) 0x2026);
165 continue;
166 }
167 if (sjis0213.isOK())
168 {
169 /* lift */
170 SV_UCS4 ucs4;
171 SV_UCS4 decd;
172 SS_UCS4 ch = (SS_UCS4)in[i];
173 ucs4.append (ch);
174 unsigned int lifted = sjis0213.lift (ucs4, 0, true, &decd);
175 if (lifted != 0 && decd.size() != 0 && decd[0] != 0)
176 {
177 expandYuditLigatures (&decd);
178 ucs4string.append (decd);
179 continue;
180 }
181 /* try two bytes */
182 if (i+1 < input.size())
183 {
184 ucs4.clear();
185 ch = ch << 8;
186 ch += (SS_UCS4)in[i+1];
187 ucs4.append (ch);
188 lifted = sjis0213.lift (ucs4, 0, true, &decd);
189 if (lifted != 0 && decd.size() != 0 && decd[0] != 0)
190 {
191 expandYuditLigatures (&decd);
192 ucs4string.append (decd);
193 i++;
194 continue;
195 }
196 }
197 }
198
199 if (in[i] > 0x80)
200 {
201 quoteUCS4 (in[i]);
202 continue;
203 }
204 ucs4string.append ((SS_UCS4) in[i]);
205 }
206 return ucs4string;
207 }
208
209
210 /**
211 * These methods guess the line delimiters for the input
212 * The one without arguments is giving the 'first approximation'
213 * It returns an inclusive list of all possibilities.
214 */
215 const SStringVector&
delimiters()216 SB_S_JIS0213::delimiters ()
217 {
218 return realDelimiters;
219 }
220
221 /**
222 * These methods guess the line delimiters for the input
223 * The one without arguments is giving the 'first approximation'
224 * It returns an exact list
225 */
226 const SStringVector&
delimiters(const SString & sample)227 SB_S_JIS0213::delimiters (const SString& sample)
228 {
229 return sampleDelimiters;
230 }
231