1 // ----------------------------------------------------------------------------
2 // charsetdistiller.cxx  --  input charset cleaning and conversion
3 //
4 // Copyright (C) 2012
5 //		Andrej Lajovic, S57LN
6 //
7 // This file is part of fldigi.
8 //
9 // Fldigi is free software: you can redistribute it and/or modify
10 // it under the terms of the GNU General Public License as published by
11 // the Free Software Foundation, either version 3 of the License, or
12 // (at your option) any later version.
13 //
14 // Fldigi is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17 // GNU General Public License for more details.
18 //
19 // You should have received a copy of the GNU General Public License
20 // along with fldigi.  If not, see <http://www.gnu.org/licenses/>.
21 // ----------------------------------------------------------------------------
22 
23 #include <config.h>
24 
25 #include <cstring>
26 #include <string>
27 
28 #include "debug.h"
29 #include "charsetdistiller.h"
30 #include "tiniconv.h"
31 
32 using namespace std;
33 
34 /*
35     CharsetDistiller
36 
37  This class implements a charset "distiller" that receives input data one
38  byte at a time and converts this data stream from a particular character
39  set into UTF-8. Invalid input data is treated as if it was encoded in
40  CP1252. Character set conversion is performed as soon as possible, i.e.,
41  when enough input is received to constitute a valid character in the input
 character set, this character is immediately converted into UTF-8 and made
43  available at the output.
44  */
45 
46 
47 /*
48  The constructor. Look up tiniconv.h for the list of possible values of
49  charset_in.
50 */
CharsetDistiller(const int charset_in)51 CharsetDistiller::CharsetDistiller(const int charset_in)
52 {
53    bufptr = buf;
54    nutf8 = 0;
55    tiniconv_init(charset_in, TINICONV_CHARSET_UTF_8, 0, &ctx);
56    tiniconv_init(TINICONV_CHARSET_CP1252, TINICONV_CHARSET_UTF_8, TINICONV_OPTION_IGNORE_IN_ILSEQ, &ctx1252);
57 }
58 
59 
60 /*
61  Change the input encoding. Look up tiniconv.h for the list of possible
62  values of charset_in.
63 
64  Returns 0 if successful or -1 in case of error.
65  */
set_input_encoding(const int charset_in)66 int CharsetDistiller::set_input_encoding(const int charset_in)
67 {
68    flush();
69    return tiniconv_init(charset_in, TINICONV_CHARSET_UTF_8, 0, &ctx);
70 }
71 
72 
73 /*
74  Receive a single byte of input data and make an immediate conversion
75  attempt.
76  */
rx(const unsigned char c)77 void CharsetDistiller::rx(const unsigned char c)
78 {
79    *bufptr++ = c;
80    process_buffer();
81 }
82 
83 
84 /*
85  Receive a zero-terminated string of input data.
86 
87  This is a convenience method: it merely feeds the string into the distiller
88  one byte at a time.
89  */
rx(const unsigned char * c)90 void CharsetDistiller::rx(const unsigned char *c)
91 {
92    const unsigned char *ptr;
93    for (ptr = c; *ptr != 0; ptr++)
94       rx(*ptr);
95 }
96 
97 
98 /*
 Examine the input buffer and decide on the possible actions (construct a
 UTF-8 character, interpret the bytes as invalid input etc.)
101  */
process_buffer(void)102 void CharsetDistiller::process_buffer(void)
103 {
104    bool again = true;
105 
106    while (again)
107    {
108       if (bufptr == buf)
109       {
110          // the buffer is empty
111          return;
112       }
113 
114       int convert_status;
115       int consumed_in;
116       int consumed_out;
117       unsigned char outbuf[6];
118 
119       convert_status = tiniconv_convert(&ctx, buf, (bufptr - buf), &consumed_in, outbuf, sizeof(outbuf), &consumed_out);
120 
121       if (consumed_out)
122       {
123          // Append the converted data to the output string.
124          outdata.append(reinterpret_cast<char *>(outbuf), consumed_out);
125 
126          // Count the number of converted UTF-8 characters (by counting the
127          // number of bytes that are not continuation bytes).
128          for (unsigned char *iptr = outbuf; iptr < outbuf + consumed_out; iptr++)
129          {
130             if ((*iptr & 0xc0) != 0x80)
131                nutf8++;
132          }
133 
134          // If not all input was consumed, move the remaining data to the
135          // beginning of the buffer
136          if (bufptr - buf > consumed_in)
137          {
138             memmove(buf, buf + consumed_in, bufptr - buf - consumed_in);
139             bufptr -= consumed_in;
140          }
141          else
142             bufptr = buf;
143       }
144 
145       again = false;
146 
147       if (convert_status == TINICONV_CONVERT_OK)
148       {
149          // Successful conversion, nothing else to do.
150          return;
151       }
152       else if (convert_status == TINICONV_CONVERT_IN_TOO_SMALL)
153       {
154          // Partial data left in the input buffer. We can't proceed with the
155          // conversion until we get more input.
156          return;
157       }
158       else if (convert_status == TINICONV_CONVERT_IN_ILSEQ)
159       {
160          // Invalid sequence in input; spit out the offending byte and try again.
161          shift_first_out();
162          again = true;
163       }
164       else if (convert_status == TINICONV_CONVERT_OUT_TOO_SMALL)
165       {
166          // More characters were available than could be converted in one
167          // go. Have another round.
168          again = true;
169       }
170       // The following two cases should never happen.
171       else if (convert_status == TINICONV_CONVERT_OUT_ILSEQ)
172       {
173          LOG_ERROR("Character not representable in UTF-8? Is this possible?");
174          bufptr = buf;
175          return;
176       }
177       else
178       {
179          LOG_ERROR("Unknown tiniconv return value %d.", convert_status);
180          bufptr = buf;
181          return;
182       }
183    }
184 }
185 
186 
187 /*
188  Convert the first byte of the input buffer; treat it as if it was encoded
189  in CP1252
190  */
shift_first_out(void)191 void CharsetDistiller::shift_first_out(void)
192 {
193    int consumed_in;
194    int consumed_out;
195    unsigned char outbuf[6];
196 
197    tiniconv_convert(&ctx1252, buf, 1, &consumed_in, outbuf, sizeof(outbuf), &consumed_out);
198 
199    outdata.append(reinterpret_cast<char *>(outbuf), consumed_out);
200    nutf8++;
201 
202    memmove(buf, buf+1, (bufptr - buf - 1));
203    bufptr--;
204 }
205 
206 
207 /*
208  Flush input. Recode the input data left in the buffer in whatever way
209  necessary to make the buffer empty.
210  */
flush(void)211 void CharsetDistiller::flush(void)
212 {
213    while (bufptr > buf)
214       shift_first_out();
215 }
216 
217 
218 /*
219  Reset input buffer. All data still waiting in the input buffer is lost.
220  Data already converted and waiting at the output is not affected.
221 */
reset(void)222 void CharsetDistiller::reset(void)
223 {
224 	bufptr = buf;
225 }
226 
227 
228 /*
229  Clear the output buffer.
230  */
clear(void)231 void CharsetDistiller::clear(void)
232 {
233    outdata.clear();
234    nutf8 = 0;
235 }
236 
237 
238 /*
239  Return the number of bytes available in the output buffer.
240  */
data_length(void)241 int CharsetDistiller::data_length(void)
242 {
243    return outdata.length();
244 }
245 
246 
247 /*
248  Return the number of UTF-8 characters in the output buffer.
249  */
num_chars(void)250 int CharsetDistiller::num_chars(void)
251 {
252    return nutf8;
253 }
254 
255 
256 /*
257  Return a reference to the output buffer.
258  */
data(void)259 const string &CharsetDistiller::data(void)
260 {
261    return outdata;
262 }
263