1 // ----------------------------------------------------------------------------
2 // charsetdistiller.cxx -- input charset cleaning and conversion
3 //
4 // Copyright (C) 2012
5 // Andrej Lajovic, S57LN
6 //
7 // This file is part of fldigi.
8 //
9 // Fldigi is free software: you can redistribute it and/or modify
10 // it under the terms of the GNU General Public License as published by
11 // the Free Software Foundation, either version 3 of the License, or
12 // (at your option) any later version.
13 //
14 // Fldigi is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 // GNU General Public License for more details.
18 //
19 // You should have received a copy of the GNU General Public License
20 // along with fldigi. If not, see <http://www.gnu.org/licenses/>.
21 // ----------------------------------------------------------------------------
22
23 #include <config.h>
24
25 #include <cstring>
26 #include <string>
27
28 #include "debug.h"
29 #include "charsetdistiller.h"
30 #include "tiniconv.h"
31
32 using namespace std;
33
34 /*
35 CharsetDistiller
36
37 This class implements a charset "distiller" that receives input data one
38 byte at a time and converts this data stream from a particular character
39 set into UTF-8. Invalid input data is treated as if it was encoded in
40 CP1252. Character set conversion is performed as soon as possible, i.e.,
41 when enough input is received to constitute a valid character in the input
42 character set, this character is immediately converted into UTF-8 and made
43 available at the output.
44 */
45
46
47 /*
48 The constructor. Look up tiniconv.h for the list of possible values of
49 charset_in.
50 */
CharsetDistiller(const int charset_in)51 CharsetDistiller::CharsetDistiller(const int charset_in)
52 {
53 bufptr = buf;
54 nutf8 = 0;
55 tiniconv_init(charset_in, TINICONV_CHARSET_UTF_8, 0, &ctx);
56 tiniconv_init(TINICONV_CHARSET_CP1252, TINICONV_CHARSET_UTF_8, TINICONV_OPTION_IGNORE_IN_ILSEQ, &ctx1252);
57 }
58
59
60 /*
61 Change the input encoding. Look up tiniconv.h for the list of possible
62 values of charset_in.
63
64 Returns 0 if successful or -1 in case of error.
65 */
set_input_encoding(const int charset_in)66 int CharsetDistiller::set_input_encoding(const int charset_in)
67 {
68 flush();
69 return tiniconv_init(charset_in, TINICONV_CHARSET_UTF_8, 0, &ctx);
70 }
71
72
73 /*
74 Receive a single byte of input data and make an immediate conversion
75 attempt.
76 */
rx(const unsigned char c)77 void CharsetDistiller::rx(const unsigned char c)
78 {
79 *bufptr++ = c;
80 process_buffer();
81 }
82
83
84 /*
85 Receive a zero-terminated string of input data.
86
87 This is a convenience method: it merely feeds the string into the distiller
88 one byte at a time.
89 */
rx(const unsigned char * c)90 void CharsetDistiller::rx(const unsigned char *c)
91 {
92 const unsigned char *ptr;
93 for (ptr = c; *ptr != 0; ptr++)
94 rx(*ptr);
95 }
96
97
98 /*
99 Examine the input buffer and decide on the possible actions (construct an
100 UTF-8 character, interpret the bytes as invalid input etc.)
101 */
process_buffer(void)102 void CharsetDistiller::process_buffer(void)
103 {
104 bool again = true;
105
106 while (again)
107 {
108 if (bufptr == buf)
109 {
110 // the buffer is empty
111 return;
112 }
113
114 int convert_status;
115 int consumed_in;
116 int consumed_out;
117 unsigned char outbuf[6];
118
119 convert_status = tiniconv_convert(&ctx, buf, (bufptr - buf), &consumed_in, outbuf, sizeof(outbuf), &consumed_out);
120
121 if (consumed_out)
122 {
123 // Append the converted data to the output string.
124 outdata.append(reinterpret_cast<char *>(outbuf), consumed_out);
125
126 // Count the number of converted UTF-8 characters (by counting the
127 // number of bytes that are not continuation bytes).
128 for (unsigned char *iptr = outbuf; iptr < outbuf + consumed_out; iptr++)
129 {
130 if ((*iptr & 0xc0) != 0x80)
131 nutf8++;
132 }
133
134 // If not all input was consumed, move the remaining data to the
135 // beginning of the buffer
136 if (bufptr - buf > consumed_in)
137 {
138 memmove(buf, buf + consumed_in, bufptr - buf - consumed_in);
139 bufptr -= consumed_in;
140 }
141 else
142 bufptr = buf;
143 }
144
145 again = false;
146
147 if (convert_status == TINICONV_CONVERT_OK)
148 {
149 // Successful conversion, nothing else to do.
150 return;
151 }
152 else if (convert_status == TINICONV_CONVERT_IN_TOO_SMALL)
153 {
154 // Partial data left in the input buffer. We can't proceed with the
155 // conversion until we get more input.
156 return;
157 }
158 else if (convert_status == TINICONV_CONVERT_IN_ILSEQ)
159 {
160 // Invalid sequence in input; spit out the offending byte and try again.
161 shift_first_out();
162 again = true;
163 }
164 else if (convert_status == TINICONV_CONVERT_OUT_TOO_SMALL)
165 {
166 // More characters were available than could be converted in one
167 // go. Have another round.
168 again = true;
169 }
170 // The following two cases should never happen.
171 else if (convert_status == TINICONV_CONVERT_OUT_ILSEQ)
172 {
173 LOG_ERROR("Character not representable in UTF-8? Is this possible?");
174 bufptr = buf;
175 return;
176 }
177 else
178 {
179 LOG_ERROR("Unknown tiniconv return value %d.", convert_status);
180 bufptr = buf;
181 return;
182 }
183 }
184 }
185
186
187 /*
188 Convert the first byte of the input buffer; treat it as if it was encoded
189 in CP1252
190 */
shift_first_out(void)191 void CharsetDistiller::shift_first_out(void)
192 {
193 int consumed_in;
194 int consumed_out;
195 unsigned char outbuf[6];
196
197 tiniconv_convert(&ctx1252, buf, 1, &consumed_in, outbuf, sizeof(outbuf), &consumed_out);
198
199 outdata.append(reinterpret_cast<char *>(outbuf), consumed_out);
200 nutf8++;
201
202 memmove(buf, buf+1, (bufptr - buf - 1));
203 bufptr--;
204 }
205
206
207 /*
208 Flush input. Recode the input data left in the buffer in whatever way
209 necessary to make the buffer empty.
210 */
flush(void)211 void CharsetDistiller::flush(void)
212 {
213 while (bufptr > buf)
214 shift_first_out();
215 }
216
217
218 /*
219 Reset input buffer. All data still waiting in the input buffer is lost.
220 Data already converted and waiting at the output is not affected.
221 */
reset(void)222 void CharsetDistiller::reset(void)
223 {
224 bufptr = buf;
225 }
226
227
228 /*
229 Clear the output buffer.
230 */
clear(void)231 void CharsetDistiller::clear(void)
232 {
233 outdata.clear();
234 nutf8 = 0;
235 }
236
237
238 /*
239 Return the number of bytes available in the output buffer.
240 */
data_length(void)241 int CharsetDistiller::data_length(void)
242 {
243 return outdata.length();
244 }
245
246
247 /*
248 Return the number of UTF-8 characters in the output buffer.
249 */
num_chars(void)250 int CharsetDistiller::num_chars(void)
251 {
252 return nutf8;
253 }
254
255
256 /*
257 Return a reference to the output buffer.
258 */
data(void)259 const string &CharsetDistiller::data(void)
260 {
261 return outdata;
262 }
263