1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2014 Free Software Foundation, Inc.
3
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
16
17 #include <config.h>
18
19 #include "line-reader.h"
20
21 #include <assert.h>
22 #include <errno.h>
23 #include <fcntl.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <unistd.h>
27
28 #include "libpspp/assertion.h"
29 #include "libpspp/encoding-guesser.h"
30 #include "libpspp/i18n.h"
31 #include "libpspp/str.h"
32
33 #include "gl/minmax.h"
34 #include "gl/xalloc.h"
35
/* How a line_reader currently scans its buffered input for line ends. */
enum line_reader_state
  {
    S_UNIBYTE,                  /* Encoding is settled; code unit is 1 byte. */
    S_MULTIBYTE,                /* Encoding is settled; code unit is wider. */
    S_AUTO                      /* Encoding is still being autodetected. */
  };
42
43 struct line_reader
44 {
45 int fd;
46 enum line_reader_state state;
47 struct encoding_info encoding_info;
48
49 char *encoding; /* Current encoding. */
50 char *auto_encoding; /* In S_AUTO mode, user-specified encoding. */
51
52 char *buffer;
53 char *head;
54 size_t length;
55
56 int error;
57 bool eof;
58 };
59
/* Defined below: refills R's input buffer from its file descriptor. */
static ssize_t fill_buffer (struct line_reader *r);
61
/* Opens FILENAME with open(), passing FLAGS (which must not include O_CREAT),
   and wraps the resulting descriptor in a new line_reader that interprets the
   file's contents according to ENCODING.

   Returns the new line_reader on success.  On failure, returns NULL with errno
   set to an appropriate value; the descriptor opened here is closed again.

   The accepted forms for ENCODING are listed at the top of
   encoding-guesser.h. */
struct line_reader *
line_reader_for_file (const char *encoding, const char *filename, int flags)
{
  struct line_reader *reader;
  int saved_errno;
  int fd;

  assert (!(flags & O_CREAT));

  fd = open (filename, flags);
  if (fd < 0)
    return NULL;

  reader = line_reader_for_fd (encoding, fd);
  if (reader != NULL)
    return reader;

  /* Creating the reader failed.  Close FD, but report the original failure
     rather than whatever close() might leave in errno. */
  saved_errno = errno;
  close (fd);
  errno = saved_errno;
  return NULL;
}
90
91 /* Creates and returns a new line_reader that reads its input from FD. Returns
92 a new line_reader if successful, otherwise returns NULL and sets errno to an
93 appropriate value.
94
95 The accepted forms for ENCODING are listed at the top of
96 encoding-guesser.h. */
97 struct line_reader *
line_reader_for_fd(const char * encoding,int fd)98 line_reader_for_fd (const char *encoding, int fd)
99 {
100 struct line_reader *r;
101
102 r = calloc (1, sizeof *r);
103 if (r == NULL)
104 return NULL;
105
106 r->fd = fd;
107 r->buffer = malloc (LINE_READER_BUFFER_SIZE);
108 if (r->buffer == NULL)
109 goto error;
110 r->head = r->buffer;
111 r->length = 0;
112
113 if (fill_buffer (r) < 0)
114 goto error;
115
116 r->encoding = xstrdup (encoding_guess_head_encoding (
117 encoding, r->buffer, r->length));
118 if (!get_encoding_info (&r->encoding_info, r->encoding))
119 {
120 errno = EINVAL;
121 goto error;
122 }
123
124 if (encoding_guess_encoding_is_auto (encoding)
125 && !strcmp (r->encoding, "ASCII"))
126 {
127 r->state = S_AUTO;
128 r->auto_encoding = encoding ? xstrdup (encoding) : NULL;
129 }
130 else
131 r->state = r->encoding_info.unit == 1 ? S_UNIBYTE : S_MULTIBYTE;
132
133 return r;
134
135 error:
136 line_reader_free (r);
137 return NULL;
138 }
139
140 /* Closes R and its underlying file descriptor and frees all associated
141 resources. Returns the return value from close(). */
142 int
line_reader_close(struct line_reader * r)143 line_reader_close (struct line_reader *r)
144 {
145 if (r != NULL)
146 {
147 int fd = r->fd;
148 line_reader_free (r);
149 return close (fd);
150 }
151 return 0;
152 }
153
154 /* Frees R and associated resources, but does not close the underlying file
155 descriptor. (Thus, the client must close the file descriptor when it is no
156 longer needed.) */
157 void
line_reader_free(struct line_reader * r)158 line_reader_free (struct line_reader *r)
159 {
160 if (r != NULL)
161 {
162 free (r->buffer);
163 free (r->encoding);
164 free (r->auto_encoding);
165 free (r);
166 }
167 }
168
169 static ssize_t
fill_buffer(struct line_reader * r)170 fill_buffer (struct line_reader *r)
171 {
172 ssize_t n;
173
174 /* Move any unused bytes to the beginning of the input buffer. */
175 if (r->length > 0 && r->buffer != r->head)
176 memmove (r->buffer, r->head, r->length);
177 r->head = r->buffer;
178
179 /* Read more input. */
180 do
181 {
182 n = read (r->fd, r->buffer + r->length,
183 LINE_READER_BUFFER_SIZE - r->length);
184 }
185 while (n < 0 && errno == EINTR);
186 if (n > 0)
187 r->length += n;
188 else if (n < 0)
189 r->error = errno;
190 else
191 r->eof = true;
192 return n;
193 }
194
195 static void
output_bytes(struct line_reader * r,struct string * s,size_t n)196 output_bytes (struct line_reader *r, struct string *s, size_t n)
197 {
198 ds_put_substring (s, ss_buffer (r->head, n));
199 r->head += n;
200 r->length -= n;
201 }
202
203 static void
output_line(struct line_reader * r,struct string * s,size_t n)204 output_line (struct line_reader *r, struct string *s, size_t n)
205 {
206 int unit = r->encoding_info.unit;
207
208 output_bytes (r, s, n);
209
210 r->head += unit;
211 r->length -= unit;
212
213 ds_chomp (s, ss_buffer (r->encoding_info.cr, unit));
214 }
215
216 /* Reads a line of text, but no more than MAX_LENGTH bytes, from R and appends
217 it to S, omitting the final new-line and the carriage return that
218 immediately precedes it, if one is present. The line is left in its
219 original encoding.
220
221 Returns true if anything was successfully read from the file. (If an empty
222 line was read, then nothing is appended to S.) Returns false if end of file
223 was reached or a read error occurred before any text could be read. */
224 bool
line_reader_read(struct line_reader * r,struct string * s,size_t max_length)225 line_reader_read (struct line_reader *r, struct string *s, size_t max_length)
226 {
227 size_t original_length = ds_length (s);
228 int unit = r->encoding_info.unit;
229
230 do
231 {
232 size_t max_out = max_length - (ds_length (s) - original_length);
233 size_t max_in = r->length;
234 size_t max = MIN (max_in, max_out);
235 size_t n;
236 char *p;
237
238 if (max_out < unit)
239 break;
240
241 switch (r->state)
242 {
243 case S_UNIBYTE:
244 p = memchr (r->head, r->encoding_info.lf[0], max);
245 if (p != NULL)
246 {
247 output_line (r, s, p - r->head);
248 return true;
249 }
250 n = max;
251 break;
252
253 case S_MULTIBYTE:
254 for (n = 0; n + unit <= max; n += unit)
255 if (!memcmp (r->head + n, r->encoding_info.lf, unit))
256 {
257 output_line (r, s, n);
258 return true;
259 }
260 break;
261
262 case S_AUTO:
263 for (n = 0; n < max; n++)
264 if (!encoding_guess_is_ascii_text (r->head[n]))
265 {
266 char *encoding;
267
268 output_bytes (r, s, n);
269 fill_buffer (r);
270 r->state = S_UNIBYTE;
271
272 encoding = xstrdup (encoding_guess_tail_encoding (
273 r->auto_encoding, r->head, r->length));
274 free (r->encoding);
275 r->encoding = encoding;
276
277 free (r->auto_encoding);
278 r->auto_encoding = NULL;
279
280 n = 0;
281 break;
282 }
283 else if (r->head[n] == '\n')
284 {
285 output_line (r, s, n);
286 return true;
287 }
288 break;
289
290 default:
291 NOT_REACHED ();
292 }
293
294 output_bytes (r, s, n);
295 }
296 while (r->length >= unit || fill_buffer (r) > 0);
297
298 return ds_length (s) > original_length;
299 }
300
301 /* Returns the file descriptor underlying R. */
302 int
line_reader_fileno(const struct line_reader * r)303 line_reader_fileno (const struct line_reader *r)
304 {
305 return r->fd;
306 }
307
308 /* Returns the offset in the file of the next byte to be read from R, or -1 on
309 error (e.g. if the file is not seekable). */
310 off_t
line_reader_tell(const struct line_reader * r)311 line_reader_tell (const struct line_reader *r)
312 {
313 off_t pos = lseek (r->fd, 0, SEEK_CUR);
314 return (pos < 0 ? pos
315 : pos >= r->length ? pos - r->length
316 : 0);
317 }
318
319 /* Returns true if end of file has been encountered reading R. */
320 bool
line_reader_eof(const struct line_reader * r)321 line_reader_eof (const struct line_reader *r)
322 {
323 return r->eof && !r->length;
324 }
325
326 /* Returns an nonzero errno value if an error has been encountered reading
327 R, zero otherwise. */
328 int
line_reader_error(const struct line_reader * r)329 line_reader_error (const struct line_reader *r)
330 {
331 return !r->length ? r->error : 0;
332 }
333
334 /* Returns the encoding of R. If line_reader_is_auto(R) returns true, the
335 encoding might change as more lines are read. */
336 const char *
line_reader_get_encoding(const struct line_reader * r)337 line_reader_get_encoding (const struct line_reader *r)
338 {
339 return r->encoding;
340 }
341
342 /* Returns true if the encoding of the file being read by R is not yet
343 completely known. If this function returns true, then the encoding returned
344 by line_reader_get_encoding() might change as more lines are read (and after
345 the change, this function will return false). */
346 bool
line_reader_is_auto(const struct line_reader * r)347 line_reader_is_auto (const struct line_reader *r)
348 {
349 return r->state == S_AUTO;
350 }
351