1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2012, 2013, 2014 Free Software Foundation, Inc.
3
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
16
17 #include <config.h>
18
19 #include "u8-istream.h"
20
21 #include <assert.h>
22 #include <errno.h>
23 #include <fcntl.h>
24 #include <iconv.h>
25 #include <limits.h>
26 #include <stdint.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <unistd.h>
31 #include <unistr.h>
32
33 #include "libpspp/assertion.h"
34 #include "libpspp/cast.h"
35 #include "libpspp/compiler.h"
36 #include "libpspp/encoding-guesser.h"
37 #include "libpspp/i18n.h"
38
39 #include "gl/c-strcase.h"
40 #include "gl/localcharset.h"
41 #include "gl/minmax.h"
42
/* What is currently known about the encoding of a stream's input. */
enum u8_istream_state
  {
    /* Stream encoding not yet known. */
    S_AUTO,

    /* Stream encoding is known to be UTF-8. */
    S_UTF8,

    /* Stream encoding is known but not UTF-8. */
    S_CONVERT
  };
49
50 struct u8_istream
51 {
52 int fd;
53 iconv_t converter;
54 enum u8_istream_state state;
55
56 char *buffer;
57 char *head;
58 size_t length;
59
60 char outbuf[4];
61 size_t outlen;
62 };
63
static ssize_t fill_buffer (struct u8_istream *is);
65
/* Opens FILENAME, whose contents are encoded in FROMCODE, so that it can be
   read as a stream of UTF-8.  FLAGS is passed along to open() and must not
   include O_CREAT.

   Returns the new u8_istream on success.  On failure, returns NULL with errno
   set to an appropriate value.

   The accepted forms for FROMCODE are listed at the top of
   encoding-guesser.h. */
struct u8_istream *
u8_istream_for_file (const char *fromcode, const char *filename, int flags)
{
  struct u8_istream *stream;
  int saved_errno;
  int fd;

  /* open() is called below without a mode argument, which O_CREAT would
     require. */
  assert ((flags & O_CREAT) == 0);

  fd = open (filename, flags);
  if (fd < 0)
    return NULL;

  stream = u8_istream_for_fd (fromcode, fd);
  if (stream != NULL)
    return stream;

  /* Stream creation failed, so the descriptor is still ours to close.  Keep
     close() from overwriting the errno that describes the failure. */
  saved_errno = errno;
  close (fd);
  errno = saved_errno;

  return NULL;
}
94
95 /* Creates and returns a new u8_istream that reads its input from FD. Returns
96 a new u8_istream if successful, otherwise returns NULL and sets errno to an
97 appropriate value.
98
99 The accepted forms for FROMCODE are listed at the top of
100 encoding-guesser.h. */
101 struct u8_istream *
u8_istream_for_fd(const char * fromcode,int fd)102 u8_istream_for_fd (const char *fromcode, int fd)
103 {
104 struct u8_istream *is;
105 const char *encoding;
106
107 is = malloc (sizeof *is);
108 if (is == NULL)
109 return NULL;
110
111 is->fd = fd;
112 is->converter = (iconv_t) -1;
113 is->buffer = malloc (U8_ISTREAM_BUFFER_SIZE);
114 if (is->buffer == NULL)
115 goto error;
116 is->head = is->buffer;
117 is->length = 0;
118 is->outlen = 0;
119
120 if (fill_buffer (is) < 0)
121 goto error;
122
123 encoding = encoding_guess_head_encoding (fromcode, is->buffer, is->length);
124 if (is_encoding_utf8 (encoding))
125 {
126 unsigned int bom_len;
127
128 is->state = S_UTF8;
129 bom_len = encoding_guess_bom_length (encoding, is->buffer, is->length);
130 is->head += bom_len;
131 is->length -= bom_len;
132 }
133 else
134 {
135 if (encoding_guess_encoding_is_auto (fromcode)
136 && !strcmp (encoding, "ASCII"))
137 {
138 is->state = S_AUTO;
139 encoding = encoding_guess_parse_encoding (fromcode);
140 }
141 else
142 is->state = S_CONVERT;
143
144 is->converter = iconv_open ("UTF-8", encoding);
145 if (is->converter == (iconv_t) -1)
146 goto error;
147 }
148
149 return is;
150
151 error:
152 u8_istream_free (is);
153 return NULL;
154 }
155
156 /* Closes IS and its underlying file descriptor and frees all associated
157 resources. Returns the return value from close(). */
158 int
u8_istream_close(struct u8_istream * is)159 u8_istream_close (struct u8_istream *is)
160 {
161 if (is != NULL)
162 {
163 int fd = is->fd;
164 u8_istream_free (is);
165 return close (fd);
166 }
167 return 0;
168 }
169
170 /* Frees IS and associated resources, but does not close the underlying file
171 descriptor. (Thus, the client must close the file descriptor when it is no
172 longer needed.) */
173 void
u8_istream_free(struct u8_istream * is)174 u8_istream_free (struct u8_istream *is)
175 {
176 if (is != NULL)
177 {
178 if (is->converter != (iconv_t) -1)
179 iconv_close (is->converter);
180 free (is->buffer);
181 free (is);
182 }
183 }
184
185 static void
substitute_invalid_input_byte(struct u8_istream * is)186 substitute_invalid_input_byte (struct u8_istream *is)
187 {
188 assert (is->outlen == 0);
189 is->head++;
190 is->length--;
191 is->outlen = u8_uctomb (CHAR_CAST (uint8_t *, is->outbuf),
192 0xfffd, sizeof is->outbuf);
193 }
194
195 static ssize_t
fill_buffer(struct u8_istream * is)196 fill_buffer (struct u8_istream *is)
197 {
198 ssize_t n;
199
200 /* Move any unused bytes to the beginning of the input buffer. */
201 if (is->length > 0 && is->buffer != is->head)
202 memmove (is->buffer, is->head, is->length);
203 is->head = is->buffer;
204
205 /* Read more input. */
206 n = 0;
207 do
208 {
209 ssize_t retval = read (is->fd, is->buffer + is->length,
210 U8_ISTREAM_BUFFER_SIZE - is->length);
211 if (retval > 0)
212 {
213 n += retval;
214 is->length += retval;
215 }
216 else if (retval == 0)
217 return n;
218 else if (errno != EINTR)
219 return n > 0 ? n : -1;
220 }
221 while (is->length < U8_ISTREAM_BUFFER_SIZE);
222 return n;
223 }
224
225 static ssize_t
read_auto(struct u8_istream * is,char * buffer,size_t size)226 read_auto (struct u8_istream *is, char *buffer, size_t size)
227 {
228 size_t original_size = size;
229 int retval = 0;
230
231 while (size > 0)
232 {
233 if (is->length > 0)
234 {
235 size_t n_ascii;
236
237 n_ascii = encoding_guess_count_ascii (is->head,
238 MIN (is->length, size));
239
240 memcpy (buffer, is->head, n_ascii);
241 buffer += n_ascii;
242 size -= n_ascii;
243
244 is->head += n_ascii;
245 is->length -= n_ascii;
246
247 if (size == 0)
248 break;
249 }
250
251 if (is->length == 0)
252 {
253 retval = fill_buffer (is);
254 if (retval > 0)
255 continue;
256 else
257 break;
258 }
259
260 /* is->head points to a byte that isn't a printable ASCII character.
261 Fill up the buffer and check for UTF-8. */
262 fill_buffer (is);
263 is->state = (encoding_guess_tail_is_utf8 (is->head, is->length)
264 ? S_UTF8 : S_CONVERT);
265 if (size == original_size)
266 return u8_istream_read (is, buffer, size);
267 break;
268 }
269
270 return original_size - size;
271 }
272
273 static int
convert_iconv(iconv_t converter,char ** inbufp,size_t * inbytesleft,char ** outbufp,size_t * outbytesleft)274 convert_iconv (iconv_t converter,
275 char **inbufp, size_t *inbytesleft,
276 char **outbufp, size_t *outbytesleft)
277 {
278 size_t n = iconv (converter, (ICONV_CONST char **) inbufp, inbytesleft,
279 outbufp, outbytesleft);
280 return n == SIZE_MAX ? errno : 0;
281 }
282
283 static int
convert_utf8(iconv_t converter UNUSED,char ** inbufp,size_t * inbytesleft,char ** outbufp,size_t * outbytesleft)284 convert_utf8 (iconv_t converter UNUSED,
285 char **inbufp, size_t *inbytesleft,
286 char **outbufp, size_t *outbytesleft)
287 {
288 const uint8_t *in = CHAR_CAST (const uint8_t *, *inbufp);
289 size_t n = MIN (*inbytesleft, *outbytesleft);
290 size_t ofs = 0;
291 int error;
292
293 for (;;)
294 {
295 ucs4_t uc;
296 int mblen;
297
298 if (ofs >= n)
299 {
300 error = ofs < *inbytesleft ? E2BIG : 0;
301 break;
302 }
303
304 mblen = u8_mbtouc (&uc, in + ofs, n - ofs);
305 if (uc == 0xfffd)
306 {
307 int retval = u8_mbtoucr (&uc, in + ofs, *inbytesleft - ofs);
308 if (retval == mblen)
309 {
310 /* There's an actual U+FFFD in the input stream. Carry on. */
311 }
312 else
313 {
314 error = (retval == -1 ? EILSEQ
315 : retval == -2 ? EINVAL
316 : E2BIG);
317 break;
318 }
319 }
320
321 ofs += mblen;
322 }
323
324 if (ofs > 0)
325 {
326 memcpy (*outbufp, *inbufp, ofs);
327 *inbufp += ofs;
328 *inbytesleft -= ofs;
329 *outbufp += ofs;
330 *outbytesleft -= ofs;
331 }
332
333 return error;
334 }
335
336 static ssize_t
read_convert(struct u8_istream * is,int (* convert)(iconv_t converter,char ** inbufp,size_t * inbytesleft,char ** outbufp,size_t * outbytesleft),char * buffer,size_t size)337 read_convert (struct u8_istream *is,
338 int (*convert) (iconv_t converter,
339 char **inbufp, size_t *inbytesleft,
340 char **outbufp, size_t *outbytesleft),
341 char *buffer, size_t size)
342 {
343 size_t original_size = size;
344
345 while (size > 0)
346 {
347 ssize_t n_read;
348
349 if (is->outlen > 0)
350 {
351 size_t n = MIN (size, is->outlen);
352
353 memcpy (buffer, is->outbuf, n);
354 is->outlen -= n;
355 if (is->outlen > 0)
356 memmove (is->outbuf, is->outbuf + n, is->outlen);
357
358 buffer += n;
359 size -= n;
360
361 if (size == 0)
362 break;
363 }
364
365 if (is->length)
366 {
367 int error = convert (is->converter,
368 &is->head, &is->length,
369 &buffer, &size);
370 if (size == 0)
371 break;
372
373 switch (error)
374 {
375 case 0:
376 /* Converted all of the input into output, possibly with space
377 for output left over.
378
379 Read more input. */
380 break;
381
382 case EILSEQ:
383 substitute_invalid_input_byte (is);
384 continue;
385
386 case EINVAL:
387 /* Incomplete byte sequence at end of input. Read more
388 input. */
389 break;
390
391 default:
392 /* A real error of some kind (ENOMEM?). */
393 return -1;
394
395 case E2BIG:
396 /* Ran out of room for output.
397 Convert into outbuf and copy from there instead. */
398 {
399 char *outptr = is->outbuf;
400 size_t outleft = sizeof is->outbuf;
401
402 error = convert (is->converter,
403 &is->head, &is->length,
404 &outptr, &outleft);
405 is->outlen = outptr - is->outbuf;
406 if (is->outlen > 0)
407 continue;
408
409 switch (error)
410 {
411 case EILSEQ:
412 substitute_invalid_input_byte (is);
413 continue;
414
415 case E2BIG:
416 case EINVAL:
417 continue;
418
419 default:
420 /* A real error of some kind (ENOMEM?). */
421 return -1;
422 }
423 }
424 }
425 }
426
427 assert (is->length <= MB_LEN_MAX);
428 n_read = fill_buffer (is);
429 if (n_read <= 0)
430 {
431 if (original_size != size)
432 {
433 /* We produced some output so don't report EOF or error yet. */
434 break;
435 }
436 else if (n_read == 0 && is->length != 0)
437 {
438 /* Incomplete byte sequence at end of file. */
439 substitute_invalid_input_byte (is);
440 }
441 else
442 {
443 /* Propagate end-of-file or error to caller. */
444 return n_read;
445 }
446 }
447 }
448
449 return original_size - size;
450 }
451
452 /* Reads up to SIZE bytes of UTF-8 text from IS into BUFFER. Returns the
453 number of bytes read if successful, 0 at end of file, or -1 if an error
454 occurred before any data could be read. Upon error, sets errno to an
455 appropriate value. */
456 ssize_t
u8_istream_read(struct u8_istream * is,char * buffer,size_t size)457 u8_istream_read (struct u8_istream *is, char *buffer, size_t size)
458 {
459 switch (is->state)
460 {
461 case S_CONVERT:
462 return read_convert (is, convert_iconv, buffer, size);
463
464 case S_AUTO:
465 return read_auto (is, buffer, size);
466
467 case S_UTF8:
468 return read_convert (is, convert_utf8, buffer, size);
469 }
470
471 NOT_REACHED ();
472 }
473
474 /* Returns the file descriptor underlying IS. */
475 int
u8_istream_fileno(const struct u8_istream * is)476 u8_istream_fileno (const struct u8_istream *is)
477 {
478 return is->fd;
479 }
480
481 /* Test functions.
482
483 These functions are probably useful only for white-box testing. */
484
485 /* Returns true if the encoding of the file being read by IS is not yet
486 known. */
487 bool
u8_istream_is_auto(const struct u8_istream * is)488 u8_istream_is_auto (const struct u8_istream *is)
489 {
490 return is->state == S_AUTO;
491 }
492
493 /* Returns true if the encoding of the file being read by IS has been
494 determined to be UTF-8. */
495 bool
u8_istream_is_utf8(const struct u8_istream * is)496 u8_istream_is_utf8 (const struct u8_istream *is)
497 {
498 return is->state == S_UTF8;
499 }
500