1 /********************************************************************/
2 /* */
3 /* ut8_rtl.c Primitive actions for the UTF-8 file type. */
4 /* Copyright (C) 1989 - 2015, 2018 Thomas Mertes */
5 /* */
6 /* This file is part of the Seed7 Runtime Library. */
7 /* */
8 /* The Seed7 Runtime Library is free software; you can */
9 /* redistribute it and/or modify it under the terms of the GNU */
10 /* Lesser General Public License as published by the Free Software */
11 /* Foundation; either version 2.1 of the License, or (at your */
12 /* option) any later version. */
13 /* */
14 /* The Seed7 Runtime Library is distributed in the hope that it */
15 /* will be useful, but WITHOUT ANY WARRANTY; without even the */
16 /* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR */
17 /* PURPOSE. See the GNU Lesser General Public License for more */
18 /* details. */
19 /* */
20 /* You should have received a copy of the GNU Lesser General */
21 /* Public License along with this program; if not, write to the */
22 /* Free Software Foundation, Inc., 51 Franklin Street, */
23 /* Fifth Floor, Boston, MA 02110-1301, USA. */
24 /* */
25 /* Module: Seed7 Runtime Library */
26 /* File: seed7/src/ut8_rtl.c */
27 /* Changes: 2005, 2010, 2013, 2014 Thomas Mertes */
28 /* Content: Primitive actions for the UTF-8 file type. */
29 /* */
30 /********************************************************************/
31
32 #define LOG_FUNCTIONS 0
33 #define VERBOSE_EXCEPTIONS 0
34
35 #include "version.h"
36
37 #include "stdlib.h"
38 #include "stdio.h"
39 #include "string.h"
40 #include "sys/types.h"
41 #include "errno.h"
42
43 #include "common.h"
44 #include "os_decls.h"
45 #include "heaputl.h"
46 #include "striutl.h"
47 #include "fil_rtl.h"
48 #include "rtl_err.h"
49
50 #undef EXTERN
51 #define EXTERN
52 #include "ut8_rtl.h"
53
54
/* Number of bytes requested per fread() call by the chunked UTF-8  */
/* readers. Their stack buffers are declared as BUFFER_SIZE + 6, so */
/* bytes kept from the previous chunk fit in front of a full read.  */
#define BUFFER_SIZE 2048

/* ut8Gets() does not allocate the result up front, if more than    */
/* this number of characters is requested.                          */
#define GETS_DEFAULT_SIZE 1048576

/* Initial size and growth step (in characters) of the result       */
/* string in read_and_alloc_utf8_stri().                            */
#define GETS_STRI_SIZE_DELTA 4096

/* Initial size (in bytes) of the byte buffer used by ut8LineRead() */
/* and ut8WordRead().                                               */
#define READ_STRI_INIT_SIZE 256

/* Growth step (in bytes) of that byte buffer.                      */
#define READ_STRI_SIZE_DELTA 2048

/* NOTE(review): not referenced in the part of the file reviewed    */
/* here; presumably the block size of the write functions.          */
#define WRITE_STRI_BLOCK_SIZE 512
61
62
63 typedef struct {
64 memSizeType bytes_remaining;
65 memSizeType bytes_missing;
66 memSizeType chars_read;
67 memSizeType chars_there;
68 } readStateType;
69
70
71
/**
 *  Convert one chunk of UTF-8 bytes to string elements and update 'state'.
 *  The chunk consists of state->bytes_remaining bytes kept from the
 *  previous call, followed by 'bytes_in_buffer' freshly read bytes.
 *  If the chunk ends inside of an UTF-8 sequence the bytes of this
 *  sequence are moved to the start of 'buffer', such that the next
 *  read can append the missing bytes.
 *  @param buffer Buffer with the kept bytes and the freshly read bytes.
 *  @param bytes_in_buffer Number of bytes delivered by the last read
 *         (0 if the read delivered nothing).
 *  @param stri_dest Destination of the converted characters.
 *  @param state Read state, that is updated by this function.
 *  @param err_info Unchanged if the function succeeds, and
 *         RANGE_ERROR if the bytes cannot be part of a legal encoding
 *         or if the input ends inside of an UTF-8 sequence.
 */
static inline void bytes_to_strelements (ustriType buffer, memSizeType bytes_in_buffer,
    strElemType *stri_dest, readStateType *state, errInfoType *err_info)

  { /* bytes_to_strelements */
    if (bytes_in_buffer != 0) {
      /* The kept bytes precede the new ones in the buffer. */
      bytes_in_buffer += state->bytes_remaining;
      state->bytes_remaining = utf8_to_stri(stri_dest, &state->chars_read, buffer,
                                            bytes_in_buffer);
      if (state->bytes_remaining != 0) {
        /* The tail of the buffer was not converted. */
        state->bytes_missing = utf8_bytes_missing(
            &buffer[bytes_in_buffer - state->bytes_remaining],
            state->bytes_remaining);
        if (state->bytes_missing != 0) {
          /* Incomplete sequence: Keep it for the next call. */
          memmove(buffer, &buffer[bytes_in_buffer - state->bytes_remaining],
                  state->bytes_remaining);
          state->chars_there = 1;
        } else {
          /* The tail cannot be completed to a legal encoding. */
          *err_info = RANGE_ERROR;
          return;
        } /* if */
      } else {
        state->bytes_missing = 0;
        state->chars_there = 0;
      } /* if */
    } else {
      state->chars_read = 0;
      if (state->bytes_remaining != 0) {
        /* Nothing more can be read, but an UTF-8 sequence is still  */
        /* incomplete. Without clearing chars_there the read loops   */
        /* of the callers would never terminate. A truncated         */
        /* sequence is an illegal encoding.                          */
        state->bytes_remaining = 0;
        state->bytes_missing = 0;
        state->chars_there = 0;
        *err_info = RANGE_ERROR;
      } /* if */
    } /* if */
  } /* bytes_to_strelements */
112
113
114
115 /**
116 * Read UTF-8 characters from 'inFile' into the allocated string.
117 * Read until the string 'stri' is filled or 'inFile' reaches EOF.
118 * @param inFile File from which UTF-8 encoded characters are read.
119 * @param stri An allocated string for the requested number of chars.
120 * @param err_info Unchanged if the function succeeds, and
121 * RANGE_ERROR if inFile contains illegal encodings, and
122 * FILE_ERROR if a system function returns an error.
123 * @return the actual number of characters read.
124 */
read_utf8_string(cFileType inFile,striType stri,errInfoType * err_info)125 static memSizeType read_utf8_string (cFileType inFile, striType stri, errInfoType *err_info)
126
127 {
128 ucharType buffer[BUFFER_SIZE + 6];
129 memSizeType bytes_in_buffer;
130 memSizeType stri_pos;
131 memSizeType chars_missing;
132 readStateType state = {0, 0, 1, 0};
133
134 /* read_utf8_string */
135 for (stri_pos = 0, chars_missing = stri->size;
136 chars_missing >= BUFFER_SIZE - state.bytes_missing + state.chars_there &&
137 (state.chars_read > 0 || state.chars_there) &&
138 *err_info == OKAY_NO_ERROR;
139 stri_pos += state.chars_read, chars_missing -= state.chars_read) {
140 bytes_in_buffer = (memSizeType) fread(&buffer[state.bytes_remaining], 1,
141 BUFFER_SIZE, inFile);
142 if (bytes_in_buffer == 0 && stri_pos == 0 && ferror(inFile)) {
143 logError(printf("read_utf8_string: fread(*, 1, " FMT_U_MEM ", %d) failed.\n",
144 (memSizeType) BUFFER_SIZE, safe_fileno(inFile)););
145 *err_info = FILE_ERROR;
146 } else {
147 /* printf("#A# bytes_in_buffer=%d num_of_chars_read=%d\n",
148 bytes_in_buffer, stri_pos); */
149 bytes_to_strelements(buffer, bytes_in_buffer, &stri->mem[stri_pos],
150 &state, err_info);
151 } /* if */
152 } /* for */
153 for (; chars_missing > 0 && (state.chars_read > 0 || state.chars_there) &&
154 *err_info == OKAY_NO_ERROR;
155 stri_pos += state.chars_read, chars_missing -= state.chars_read) {
156 bytes_in_buffer = (memSizeType) fread(&buffer[state.bytes_remaining], 1,
157 chars_missing - state.chars_there + state.bytes_missing, inFile);
158 if (bytes_in_buffer == 0 && stri_pos == 0 && ferror(inFile)) {
159 logError(printf("read_utf8_string: fread(*, 1, " FMT_U_MEM ", %d) failed.\n",
160 chars_missing - state.chars_there + state.bytes_missing,
161 safe_fileno(inFile)););
162 *err_info = FILE_ERROR;
163 } else {
164 /* printf("#B# bytes_in_buffer=%d chars_missing=%d chars_read=%d "
165 "chars_there=%d bytes_missing=%d num_of_chars_read=%d\n",
166 bytes_in_buffer, chars_missing, chars_read, chars_there,
167 state.bytes_missing, stri_pos); */
168 bytes_to_strelements(buffer, bytes_in_buffer, &stri->mem[stri_pos],
169 &state, err_info);
170 } /* if */
171 } /* for */
172 return stri_pos;
173 } /* read_utf8_string */
174
175
176
177 /**
178 * Read up to 'chars_missing' UTF-8 characters from 'inFile'.
179 * Read until 'chars_missing' characters are read or 'inFile' reaches EOF.
180 * @param inFile File from which UTF-8 encoded characters are read.
181 * @param chars_missing Maximum number of characters to be read.
182 * @param num_of_chars_read Address to which the actual number of
183 * characters read is assigned.
184 * @param err_info Unchanged if the function succeeds, and
185 * RANGE_ERROR if inFile contains illegal encodings, and
186 * FILE_ERROR if a system function returns an error, and
187 * MEMORY_ERROR if there was not enough memory.
188 */
read_and_alloc_utf8_stri(cFileType inFile,memSizeType chars_missing,memSizeType * num_of_chars_read,errInfoType * err_info)189 static striType read_and_alloc_utf8_stri (cFileType inFile, memSizeType chars_missing,
190 memSizeType *num_of_chars_read, errInfoType *err_info)
191
192 {
193 ucharType buffer[BUFFER_SIZE + 6];
194 memSizeType bytes_in_buffer;
195 memSizeType result_pos;
196 memSizeType new_size;
197 striType resized_result;
198 readStateType state = {0, 0, 1, 0};
199 striType result;
200
201 /* read_and_alloc_utf8_stri */
202 logFunction(printf("read_and_alloc_utf8_stri(%d, " FMT_U_MEM ", *, *)\n",
203 safe_fileno(inFile), chars_missing););
204 if (unlikely(!ALLOC_STRI_SIZE_OK(result, GETS_STRI_SIZE_DELTA))) {
205 *err_info = MEMORY_ERROR;
206 result = NULL;
207 } else {
208 result->size = GETS_STRI_SIZE_DELTA;
209 for (result_pos = 0;
210 chars_missing >= BUFFER_SIZE - state.bytes_missing + state.chars_there &&
211 (state.chars_read > 0 || state.chars_there) &&
212 *err_info == OKAY_NO_ERROR;
213 result_pos += state.chars_read, chars_missing -= state.chars_read) {
214 bytes_in_buffer = (memSizeType) fread(&buffer[state.bytes_remaining], 1,
215 BUFFER_SIZE, inFile);
216 if (bytes_in_buffer == 0 && result_pos == 0 && ferror(inFile)) {
217 logError(printf("read_and_alloc_utf8_stri: "
218 "fread(*, 1, " FMT_U_MEM ", %d) failed.\n",
219 (memSizeType) LIST_BUFFER_SIZE, safe_fileno(inFile)););
220 *err_info = FILE_ERROR;
221 } else {
222 /* printf("#A# bytes_in_buffer=%d num_of_chars_read=%d\n",
223 bytes_in_buffer, result_pos); */
224 if (result_pos + bytes_in_buffer > result->size) {
225 new_size = result->size + GETS_STRI_SIZE_DELTA;
226 REALLOC_STRI_CHECK_SIZE(resized_result, result, result->size, new_size);
227 if (resized_result == NULL) {
228 *err_info = MEMORY_ERROR;
229 return result;
230 } else {
231 result = resized_result;
232 COUNT3_STRI(result->size, new_size);
233 result->size = new_size;
234 } /* if */
235 } /* if */
236 bytes_to_strelements(buffer, bytes_in_buffer, &result->mem[result_pos],
237 &state, err_info);
238 } /* if */
239 } /* for */
240 for (; chars_missing > 0 && (state.chars_read > 0 || state.chars_there) &&
241 *err_info == OKAY_NO_ERROR;
242 result_pos += state.chars_read, chars_missing -= state.chars_read) {
243 bytes_in_buffer = (memSizeType) fread(&buffer[state.bytes_remaining], 1,
244 chars_missing - state.chars_there + state.bytes_missing, inFile);
245 if (bytes_in_buffer == 0 && result_pos == 0 && ferror(inFile)) {
246 logError(printf("read_and_alloc_utf8_stri: "
247 "fread(*, 1, " FMT_U_MEM ", %d) failed.\n",
248 chars_missing - state.chars_there + state.bytes_missing,
249 safe_fileno(inFile)););
250 *err_info = FILE_ERROR;
251 } else {
252 /* printf("#B# bytes_in_buffer=%d chars_missing=%d chars_read=%d "
253 "chars_there=%d bytes_missing=%d num_of_chars_read=%d\n",
254 bytes_in_buffer, chars_missing, chars_read, chars_there,
255 state.bytes_missing, result_pos); */
256 if (result_pos + bytes_in_buffer > result->size) {
257 new_size = result->size + GETS_STRI_SIZE_DELTA;
258 REALLOC_STRI_CHECK_SIZE(resized_result, result, result->size, new_size);
259 if (resized_result == NULL) {
260 *err_info = MEMORY_ERROR;
261 return result;
262 } else {
263 result = resized_result;
264 COUNT3_STRI(result->size, new_size);
265 result->size = new_size;
266 } /* if */
267 } /* if */
268 bytes_to_strelements(buffer, bytes_in_buffer, &result->mem[result_pos],
269 &state, err_info);
270 } /* if */
271 } /* for */
272 *num_of_chars_read = result_pos;
273 } /* if */
274 logFunction(printf("read_and_alloc_utf8_stri(%d, " FMT_U_MEM ", " FMT_U_MEM ", %d) -->\n",
275 safe_fileno(inFile), chars_missing, *num_of_chars_read, *err_info););
276 return result;
277 } /* read_and_alloc_utf8_stri */
278
279
280
281 /**
282 * Read a character from an UTF-8 file.
283 * @return the character read, or EOF at the end of the file.
284 * @exception RANGE_ERROR The file contains an illegal encoding.
285 */
ut8Getc(fileType inFile)286 charType ut8Getc (fileType inFile)
287
288 {
289 cFileType cInFile;
290 int character;
291 charType result;
292
293 /* ut8Getc */
294 cInFile = inFile->cFile;
295 if (unlikely(cInFile == NULL)) {
296 logError(printf("ut8Getc: Attempt to read from closed file.\n"););
297 raise_error(FILE_ERROR);
298 return 0;
299 } /* if */
300 character = getc(cInFile);
301 if (character != EOF && character >= 0x80) {
302 /* character range 0x80 to 0xFF (128 to 255) */
303 if (unlikely(character <= 0xBF)) {
304 /* character range 0xC0 to 0xBF (128 to 191) */
305 logError(printf("ut8Getc(%d): "
306 "Unexpected UTF-8 continuation byte ('\\16#%02x;').\n",
307 safe_fileno(cInFile), character););
308 raise_error(RANGE_ERROR);
309 return 0;
310 } else if (character <= 0xDF) {
311 /* character range 192 to 223 (leading bits 110.....) */
312 result = (charType) (character & 0x1F) << 6;
313 character = getc(cInFile);
314 if (character >= 0x80 && character <= 0xBF) {
315 /* character range 128 to 191 (leading bits 10......) */
316 result |= character & 0x3F;
317 if (unlikely(result <= 0x7F)) {
318 logError(printf("ut8Getc(%d): "
319 "Overlong encodings are illegal "
320 "('\\16#" FMT_X32 ";').\n",
321 safe_fileno(cInFile), result););
322 raise_error(RANGE_ERROR);
323 return 0;
324 } else {
325 /* correct encodings are in the range */
326 /* 0x80 to 0x07FF (128 to 2047) */
327 } /* if */
328 } else {
329 logError(printf("ut8Getc(%d): "
330 "UTF-8 continuation byte expected "
331 "(found '\\16#%02x;').\n",
332 safe_fileno(cInFile), character););
333 raise_error(RANGE_ERROR);
334 return 0;
335 } /* if */
336 } else if (character <= 0xEF) {
337 /* character range 224 to 239 (leading bits 1110....) */
338 result = (charType) (character & 0x0F) << 12;
339 character = getc(cInFile);
340 if (character >= 0x80 && character <= 0xBF) {
341 /* character range 128 to 191 (leading bits 10......) */
342 result |= (charType) (character & 0x3F) << 6;
343 character = getc(cInFile);
344 if (character >= 0x80 && character <= 0xBF) {
345 result |= character & 0x3F;
346 if (unlikely(result <= 0x7FF)) {
347 /* (result >= 0xD800 && result <= 0xDFFF)) */
348 logError(printf("ut8Getc(%d): "
349 "Overlong encodings are illegal "
350 "('\\16#" FMT_X32 ";').\n",
351 safe_fileno(cInFile), result););
352 raise_error(RANGE_ERROR);
353 return 0;
354 } else {
355 /* correct encodings are in the range */
356 /* 0x800 to 0xFFFF (2048 to 65535) */
357 } /* if */
358 } else {
359 logError(printf("ut8Getc(%d): "
360 "UTF-8 continuation byte expected "
361 "(found '\\16#%02x;').\n",
362 safe_fileno(cInFile), character););
363 raise_error(RANGE_ERROR);
364 return 0;
365 } /* if */
366 } else {
367 logError(printf("ut8Getc(%d): "
368 "UTF-8 continuation byte expected "
369 "(found '\\16#%02x;').\n",
370 safe_fileno(cInFile), character););
371 raise_error(RANGE_ERROR);
372 return 0;
373 } /* if */
374 } else if (character <= 0xF7) {
375 /* character range 240 to 247 (leading bits 11110...) */
376 result = (charType) (character & 0x07) << 18;
377 character = getc(cInFile);
378 if (character >= 0x80 && character <= 0xBF) {
379 /* character range 128 to 191 (leading bits 10......) */
380 result |= (charType) (character & 0x3F) << 12;
381 character = getc(cInFile);
382 if (character >= 0x80 && character <= 0xBF) {
383 result |= (charType) (character & 0x3F) << 6;
384 character = getc(cInFile);
385 if (character >= 0x80 && character <= 0xBF) {
386 result |= character & 0x3F;
387 if (unlikely(result <= 0xFFFF)) {
388 logError(printf("ut8Getc(%d): "
389 "Overlong encodings are illegal "
390 "('\\16#" FMT_X32 ";').\n",
391 safe_fileno(cInFile), result););
392 raise_error(RANGE_ERROR);
393 return 0;
394 } else {
395 /* correct encodings are in the range */
396 /* 0x10000 to 0x10FFFF (65536 to 1114111) */
397 /* allowed encodings are in the range */
398 /* 0x110000 to 0x1FFFFF (1114112 to 2097151) */
399 } /* if */
400 } else {
401 logError(printf("ut8Getc(%d): "
402 "UTF-8 continuation byte expected "
403 "(found '\\16#%02x;').\n",
404 safe_fileno(cInFile), character););
405 raise_error(RANGE_ERROR);
406 return 0;
407 } /* if */
408 } else {
409 logError(printf("ut8Getc(%d): "
410 "UTF-8 continuation byte expected "
411 "(found '\\16#%02x;').\n",
412 safe_fileno(cInFile), character););
413 raise_error(RANGE_ERROR);
414 return 0;
415 } /* if */
416 } else {
417 logError(printf("ut8Getc(%d): "
418 "UTF-8 continuation byte expected "
419 "(found '\\16#%02x;').\n",
420 safe_fileno(cInFile), character););
421 raise_error(RANGE_ERROR);
422 return 0;
423 } /* if */
424 } else if (character <= 0xFB) {
425 /* character range 248 to 251 (leading bits 111110..) */
426 result = (charType) (character & 0x03) << 24;
427 character = getc(cInFile);
428 if (character >= 0x80 && character <= 0xBF) {
429 /* character range 128 to 191 (leading bits 10......) */
430 result |= (charType) (character & 0x3F) << 18;
431 character = getc(cInFile);
432 if (character >= 0x80 && character <= 0xBF) {
433 result |= (charType) (character & 0x3F) << 12;
434 character = getc(cInFile);
435 if (character >= 0x80 && character <= 0xBF) {
436 result |= (charType) (character & 0x3F) << 6;
437 character = getc(cInFile);
438 if (character >= 0x80 && character <= 0xBF) {
439 result |= character & 0x3F;
440 if (unlikely(result <= 0x1FFFFF)) {
441 logError(printf("ut8Getc(%d): "
442 "Overlong encodings are illegal "
443 "('\\16#" FMT_X32 ";').\n",
444 safe_fileno(cInFile), result););
445 raise_error(RANGE_ERROR);
446 return 0;
447 } else {
448 /* allowed encodings are in the range */
449 /* 0x200000 to 0x3FFFFFF (2097152 to 67108863) */
450 } /* if */
451 } else {
452 logError(printf("ut8Getc(%d): "
453 "UTF-8 continuation byte expected "
454 "(found '\\16#%02x;').\n",
455 safe_fileno(cInFile), character););
456 raise_error(RANGE_ERROR);
457 return 0;
458 } /* if */
459 } else {
460 logError(printf("ut8Getc(%d): "
461 "UTF-8 continuation byte expected "
462 "(found '\\16#%02x;').\n",
463 safe_fileno(cInFile), character););
464 raise_error(RANGE_ERROR);
465 return 0;
466 } /* if */
467 } else {
468 logError(printf("ut8Getc(%d): "
469 "UTF-8 continuation byte expected "
470 "(found '\\16#%02x;').\n",
471 safe_fileno(cInFile), character););
472 raise_error(RANGE_ERROR);
473 return 0;
474 } /* if */
475 } else {
476 logError(printf("ut8Getc(%d): "
477 "UTF-8 continuation byte expected "
478 "(found '\\16#%02x;').\n",
479 safe_fileno(cInFile), character););
480 raise_error(RANGE_ERROR);
481 return 0;
482 } /* if */
483 } else { /* if (character <= 0xFF) { */
484 /* character range 252 to 255 (leading bits 111111..) */
485 result = (charType) (character & 0x03) << 30;
486 character = getc(cInFile);
487 if (character >= 0x80 && character <= 0xBF) {
488 /* character range 128 to 191 (leading bits 10......) */
489 result |= (charType) (character & 0x3F) << 24;
490 character = getc(cInFile);
491 if (character >= 0x80 && character <= 0xBF) {
492 result |= (charType) (character & 0x3F) << 18;
493 character = getc(cInFile);
494 if (character >= 0x80 && character <= 0xBF) {
495 result |= (charType) (character & 0x3F) << 12;
496 character = getc(cInFile);
497 if (character >= 0x80 && character <= 0xBF) {
498 result |= (charType) (character & 0x3F) << 6;
499 character = getc(cInFile);
500 if (character >= 0x80 && character <= 0xBF) {
501 result |= character & 0x3F;
502 if (unlikely(result <= 0x3FFFFFF)) {
503 logError(printf("ut8Getc(%d): "
504 "Overlong encodings are illegal "
505 "('\\16#" FMT_X32 ";').\n",
506 safe_fileno(cInFile), result););
507 raise_error(RANGE_ERROR);
508 return 0;
509 } else {
510 /* allowed encodings are in the range */
511 /* 0x4000000 to 0xFFFFFFFF (67108864 to 4294967295) */
512 } /* if */
513 } else {
514 logError(printf("ut8Getc(%d): "
515 "UTF-8 continuation byte expected "
516 "(found '\\16#%02x;').\n",
517 safe_fileno(cInFile), character););
518 raise_error(RANGE_ERROR);
519 return 0;
520 } /* if */
521 } else {
522 logError(printf("ut8Getc(%d): "
523 "UTF-8 continuation byte expected "
524 "(found '\\16#%02x;').\n",
525 safe_fileno(cInFile), character););
526 raise_error(RANGE_ERROR);
527 return 0;
528 } /* if */
529 } else {
530 logError(printf("ut8Getc(%d): "
531 "UTF-8 continuation byte expected "
532 "(found '\\16#%02x;').\n",
533 safe_fileno(cInFile), character););
534 raise_error(RANGE_ERROR);
535 return 0;
536 } /* if */
537 } else {
538 logError(printf("ut8Getc(%d): "
539 "UTF-8 continuation byte expected "
540 "(found '\\16#%02x;').\n",
541 safe_fileno(cInFile), character););
542 raise_error(RANGE_ERROR);
543 return 0;
544 } /* if */
545 } else {
546 logError(printf("ut8Getc(%d): "
547 "UTF-8 continuation byte expected "
548 "(found '\\16#%02x;').\n",
549 safe_fileno(cInFile), character););
550 raise_error(RANGE_ERROR);
551 return 0;
552 } /* if */
553 } /* if */
554 } else {
555 result = (charType) (scharType) character;
556 } /* if */
557 return result;
558 } /* ut8Getc */
559
560
561
562 /**
563 * Read a string with 'length' characters from an UTF-8 file.
564 * In order to work reasonable good for the common case (reading
565 * just some characters) memory for 'length' characters is requested
566 * with malloc(). After the data is read the result string is
567 * shrunk to the actual size (with realloc()). If 'length' is
568 * larger than GETS_DEFAULT_SIZE or the memory cannot be requested
569 * a different strategy is used. In this case the function tries to
570 * find out the number of available characters (this is possible
571 * for a regular file but not for a pipe). If this fails a third
572 * strategy is used. In this case a smaller block is requested. This
573 * block is filled with data, resized and filled in a loop.
574 * @return the string read.
575 * @exception RANGE_ERROR The length is negative or the file
576 * contains an illegal encoding.
577 */
ut8Gets(fileType inFile,intType length)578 striType ut8Gets (fileType inFile, intType length)
579
580 {
581 cFileType cInFile;
582 memSizeType chars_requested;
583 memSizeType bytes_there;
584 memSizeType allocated_size;
585 errInfoType err_info = OKAY_NO_ERROR;
586 memSizeType num_of_chars_read;
587 striType resized_result;
588 striType result;
589
590 /* ut8Gets */
591 logFunction(printf("ut8Gets(%s%d, " FMT_D ")\n",
592 inFile == NULL ? "NULL " : "",
593 inFile != NULL ? safe_fileno(inFile->cFile) : 0,
594 length););
595 cInFile = inFile->cFile;
596 if (unlikely(cInFile == NULL)) {
597 logError(printf("ut8Gets: Attempt to read from closed file.\n"););
598 raise_error(FILE_ERROR);
599 result = NULL;
600 } else if (unlikely(length <= 0)) {
601 if (unlikely(length != 0)) {
602 logError(printf("ut8Gets(%d, " FMT_D "): Negative length.\n",
603 safe_fileno(cInFile), length););
604 raise_error(RANGE_ERROR);
605 result = NULL;
606 } else {
607 if (unlikely(!ALLOC_STRI_SIZE_OK(result, 0))) {
608 raise_error(MEMORY_ERROR);
609 } else {
610 result->size = 0;
611 } /* if */
612 } /* if */
613 } else {
614 if ((uintType) length > MAX_MEMSIZETYPE) {
615 chars_requested = MAX_MEMSIZETYPE;
616 } else {
617 chars_requested = (memSizeType) length;
618 } /* if */
619 if (chars_requested > GETS_DEFAULT_SIZE) {
620 /* Avoid requesting too much */
621 result = NULL;
622 } else {
623 allocated_size = chars_requested;
624 (void) ALLOC_STRI_SIZE_OK(result, allocated_size);
625 } /* if */
626 if (result == NULL) {
627 bytes_there = remainingBytesInFile(cInFile);
628 /* printf("bytes_there=" FMT_U_MEM "\n", bytes_there); */
629 if (bytes_there != 0) {
630 /* Now we know that bytes_there bytes are available in cInFile */
631 if (chars_requested <= bytes_there) {
632 allocated_size = chars_requested;
633 } else {
634 allocated_size = bytes_there;
635 } /* if */
636 /* printf("allocated_size=" FMT_U_MEM "\n", allocated_size); */
637 if (unlikely(!ALLOC_STRI_CHECK_SIZE(result, allocated_size))) {
638 /* printf("MAX_STRI_LEN=%lu, SIZ_STRI(MAX_STRI_LEN)=%lu\n",
639 MAX_STRI_LEN, SIZ_STRI(MAX_STRI_LEN)); */
640 raise_error(MEMORY_ERROR);
641 return NULL;
642 } /* if */
643 } /* if */
644 } /* if */
645 if (result != NULL) {
646 /* We have allocated a buffer for the requested number of chars
647 or for the number of bytes which are available in the file */
648 result->size = allocated_size;
649 num_of_chars_read = read_utf8_string(cInFile, result, &err_info);
650 } else {
651 /* We do not know how many bytes are available therefore
652 result is resized with GETS_STRI_SIZE_DELTA until we
653 have read enough or we reach EOF */
654 result = read_and_alloc_utf8_stri(cInFile, chars_requested, &num_of_chars_read,
655 &err_info);
656 } /* if */
657 if (unlikely(err_info != OKAY_NO_ERROR)) {
658 if (result != NULL) {
659 FREE_STRI(result, result->size);
660 } /* if */
661 raise_error(err_info);
662 result = NULL;
663 } else if (num_of_chars_read < result->size) {
664 REALLOC_STRI_SIZE_SMALLER(resized_result, result, result->size, num_of_chars_read);
665 if (unlikely(resized_result == NULL)) {
666 FREE_STRI(result, result->size);
667 raise_error(MEMORY_ERROR);
668 result = NULL;
669 } else {
670 result = resized_result;
671 COUNT3_STRI(result->size, num_of_chars_read);
672 result->size = num_of_chars_read;
673 } /* if */
674 } /* if */
675 } /* if */
676 logFunction(printf("ut8Gets(%d, " FMT_D ") --> \"%s\"\n",
677 safe_fileno(cInFile), length, striAsUnquotedCStri(result)););
678 return result;
679 } /* ut8Gets */
680
681
682
683 /**
684 * Read a line from an UTF-8 file.
685 * The function accepts lines ending with '\n', "\r\n" or EOF.
686 * The line ending characters are not copied into the string.
687 * That means that the '\r' of a "\r\n" sequence is silently removed.
688 * When the function is left terminationChar contains '\n' or EOF.
689 * @return the line read.
690 * @exception RANGE_ERROR The file contains an illegal encoding.
691 * @exception MEMORY_ERROR Not enough memory to represent the result.
692 * @exception FILE_ERROR A system function returns an error.
693 */
ut8LineRead(fileType inFile,charType * terminationChar)694 striType ut8LineRead (fileType inFile, charType *terminationChar)
695
696 {
697 cFileType cInFile;
698 register int ch;
699 register memSizeType position;
700 ucharType *memory;
701 memSizeType memlength;
702 memSizeType newmemlength;
703 bstriType resized_buffer;
704 bstriType buffer;
705 memSizeType result_size;
706 striType resized_result;
707 striType result;
708
709 /* ut8LineRead */
710 logFunction(printf("ut8LineRead(%s%d, '\\" FMT_U32 ";')\n",
711 inFile == NULL ? "NULL " : "",
712 inFile != NULL ? safe_fileno(inFile->cFile) : 0,
713 *terminationChar););
714 cInFile = inFile->cFile;
715 if (unlikely(cInFile == NULL)) {
716 logError(printf("ut8LineRead: Attempt to read from closed file.\n"););
717 raise_error(FILE_ERROR);
718 result = NULL;
719 } else {
720 memlength = READ_STRI_INIT_SIZE;
721 if (unlikely(!ALLOC_BSTRI_SIZE_OK(buffer, memlength))) {
722 raise_error(MEMORY_ERROR);
723 result = NULL;
724 } else {
725 memory = buffer->mem;
726 position = 0;
727 flockfile(cInFile);
728 while ((ch = getc_unlocked(cInFile)) != (int) '\n' && ch != EOF) {
729 if (position >= memlength) {
730 newmemlength = memlength + READ_STRI_SIZE_DELTA;
731 REALLOC_BSTRI_CHECK_SIZE(resized_buffer, buffer, memlength, newmemlength);
732 if (unlikely(resized_buffer == NULL)) {
733 FREE_BSTRI(buffer, memlength);
734 funlockfile(cInFile);
735 raise_error(MEMORY_ERROR);
736 return NULL;
737 } /* if */
738 buffer = resized_buffer;
739 COUNT3_BSTRI(memlength, newmemlength);
740 memory = buffer->mem;
741 memlength = newmemlength;
742 } /* if */
743 memory[position++] = (ucharType) ch;
744 } /* while */
745 funlockfile(cInFile);
746 if (ch == (int) '\n' && position != 0 && memory[position - 1] == '\r') {
747 position--;
748 } /* if */
749 if (unlikely(ch == EOF && position == 0 && ferror(cInFile))) {
750 FREE_BSTRI(buffer, memlength);
751 logError(printf("ut8LineRead(%d, '\\" FMT_U32 ";'): "
752 "getc_unlocked(%d) failed:\n"
753 "errno=%d\nerror: %s\n",
754 safe_fileno(cInFile), *terminationChar,
755 safe_fileno(cInFile), errno, strerror(errno)););
756 raise_error(FILE_ERROR);
757 result = NULL;
758 } else {
759 if (unlikely(!ALLOC_STRI_CHECK_SIZE(result, position))) {
760 FREE_BSTRI(buffer, memlength);
761 raise_error(MEMORY_ERROR);
762 } else {
763 if (unlikely(utf8_to_stri(result->mem, &result_size, buffer->mem, position) != 0)) {
764 FREE_BSTRI(buffer, memlength);
765 FREE_STRI(result, position);
766 logError(printf("ut8LineRead(%d, '\\" FMT_U32 ";'): "
767 "The file contains an illegal encoding.\n",
768 safe_fileno(cInFile), *terminationChar););
769 raise_error(RANGE_ERROR);
770 result = NULL;
771 } else {
772 FREE_BSTRI(buffer, memlength);
773 REALLOC_STRI_SIZE_OK(resized_result, result, position, result_size);
774 if (unlikely(resized_result == NULL)) {
775 FREE_STRI(result, position);
776 raise_error(MEMORY_ERROR);
777 result = NULL;
778 } else {
779 result = resized_result;
780 COUNT3_STRI(position, result_size);
781 result->size = result_size;
782 *terminationChar = (charType) ch;
783 } /* if */
784 } /* if */
785 } /* if */
786 } /* if */
787 } /* if */
788 } /* if */
789 logFunction(printf("ut8LineRead(%d, '\\" FMT_U32 ";') --> \"%s\"\n",
790 safe_fileno(cInFile), *terminationChar,
791 striAsUnquotedCStri(result)););
792 return result;
793 } /* ut8LineRead */
794
795
796
797 /**
798 * Set the current file position.
799 * The file position is measured in bytes from the start of the file.
800 * The first byte in the file has the position 1.
801 * If the file position would be in the middle of an UTF-8 encoded
802 * character the position is advanced to the beginning of the next
803 * UTF-8 character.
804 * @exception RANGE_ERROR The file position is negative or zero or
805 * the file position is not representable in the system
806 * file position type.
807 * @exception FILE_ERROR The system function returns an error.
808 */
ut8Seek(fileType aFile,intType position)809 void ut8Seek (fileType aFile, intType position)
810
811 {
812 cFileType cFile;
813 int ch;
814 int seekCorrection;
815
816 /* ut8Seek */
817 logFunction(printf("ut8Seek(%s%d, " FMT_D ")\n",
818 aFile == NULL ? "NULL " : "",
819 aFile != NULL ? safe_fileno(aFile->cFile) : 0,
820 position););
821 cFile = aFile->cFile;
822 if (unlikely(cFile == NULL)) {
823 logError(printf("ut8Seek: Attempt to set the current position of a closed file.\n"););
824 raise_error(FILE_ERROR);
825 } else if (unlikely(position <= 0)) {
826 logError(printf("ut8Seek(%d, " FMT_D "): Position <= 0.\n",
827 safe_fileno(cFile), position););
828 raise_error(RANGE_ERROR);
829 #if OS_OFF_T_SIZE < INTTYPE_SIZE
830 #if OS_OFF_T_SIZE == 32
831 } else if (unlikely(position > INT32TYPE_MAX)) {
832 logError(printf("ut8Seek(%d, " FMT_D "): "
833 "Position not representable in the system file position type.\n",
834 safe_fileno(cFile), position););
835 raise_error(RANGE_ERROR);
836 #elif OS_OFF_T_SIZE == 64
837 } else if (unlikely(position > INT64TYPE_MAX)) {
838 logError(printf("ut8Seek(%d, " FMT_D "): "
839 "Position not representable in the system file position type.\n",
840 safe_fileno(cFile), position););
841 raise_error(RANGE_ERROR);
842 #else
843 #error "sizeof(os_off_t) is neither 4 nor 8."
844 #endif
845 #endif
846 } else if (unlikely(offsetSeek(cFile, (os_off_t) (position - 1), SEEK_SET) != 0)) {
847 logError(printf("ut8Seek(%d, " FMT_D "): "
848 "offsetSeek(%d, " FMT_D ", SEEK_SET) failed.\n"
849 "errno=%d\nerror: %s\n",
850 safe_fileno(cFile), position,
851 safe_fileno(cFile), position - 1,
852 errno, strerror(errno)););
853 raise_error(FILE_ERROR);
854 } else {
855 while ((ch = getc(cFile)) != EOF &&
856 ch >= 0x80 && ch <= 0xBF) ;
857 if (ch != EOF) {
858 seekCorrection = -1;
859 } else {
860 seekCorrection = 0;
861 } /* if */
862 /* According to the specification of file I/O input */
863 /* shall not be directly followed by output without */
864 /* an intervening call to a file positioning function */
865 /* (e.g. fseek()). For this reason a seek is done to */
866 /* allow that a write can directly follow ut8Seek(). */
867 if (unlikely(offsetSeek(cFile, (os_off_t) seekCorrection, SEEK_CUR) != 0)) {
868 logError(printf("ut8Seek(%d, " FMT_D "): "
869 "offsetSeek(%d, %d, SEEK_CUR) failed.\n"
870 "errno=%d\nerror: %s\n",
871 safe_fileno(cFile), position,
872 safe_fileno(cFile), seekCorrection,
873 errno, strerror(errno)););
874 raise_error(FILE_ERROR);
875 } /* if */
876 } /* if */
877 } /* ut8Seek */
878
879
880
881 /**
882 * Read a word from an UTF-8 file.
883 * Before reading the word it skips spaces and tabs. The function
884 * accepts words ending with ' ', '\t', '\n', "\r\n" or EOF.
885 * The word ending characters are not copied into the string.
886 * That means that the '\r' of a "\r\n" sequence is silently removed.
887 * When the function is left terminationChar contains ' ', '\t', '\n' or
888 * EOF.
889 * @return the word read.
890 * @exception RANGE_ERROR The file contains an illegal encoding.
891 * @exception MEMORY_ERROR Not enough memory to represent the result.
892 * @exception FILE_ERROR A system function returns an error.
893 */
ut8WordRead(fileType inFile,charType * terminationChar)894 striType ut8WordRead (fileType inFile, charType *terminationChar)
895
896 {
897 cFileType cInFile;
898 register int ch;
899 register memSizeType position;
900 ucharType *memory;
901 memSizeType memlength;
902 memSizeType newmemlength;
903 bstriType resized_buffer;
904 bstriType buffer;
905 memSizeType result_size;
906 striType resized_result;
907 striType result;
908
909 /* ut8WordRead */
910 logFunction(printf("ut8WordRead(%s%d, '\\" FMT_U32 ";')\n",
911 inFile == NULL ? "NULL " : "",
912 inFile != NULL ? safe_fileno(inFile->cFile) : 0,
913 *terminationChar););
914 cInFile = inFile->cFile;
915 if (unlikely(cInFile == NULL)) {
916 logError(printf("ut8WordRead: Attempt to read from closed file.\n"););
917 raise_error(FILE_ERROR);
918 result = NULL;
919 } else {
920 memlength = READ_STRI_INIT_SIZE;
921 if (unlikely(!ALLOC_BSTRI_SIZE_OK(buffer, memlength))) {
922 raise_error(MEMORY_ERROR);
923 result = NULL;
924 } else {
925 memory = buffer->mem;
926 position = 0;
927 flockfile(cInFile);
928 do {
929 ch = getc_unlocked(cInFile);
930 } while (ch == (int) ' ' || ch == (int) '\t');
931 while (ch != (int) ' ' && ch != (int) '\t' &&
932 ch != (int) '\n' && ch != EOF) {
933 if (position >= memlength) {
934 newmemlength = memlength + READ_STRI_SIZE_DELTA;
935 REALLOC_BSTRI_CHECK_SIZE(resized_buffer, buffer, memlength, newmemlength);
936 if (unlikely(resized_buffer == NULL)) {
937 FREE_BSTRI(buffer, memlength);
938 funlockfile(cInFile);
939 raise_error(MEMORY_ERROR);
940 return NULL;
941 } /* if */
942 buffer = resized_buffer;
943 COUNT3_BSTRI(memlength, newmemlength);
944 memory = buffer->mem;
945 memlength = newmemlength;
946 } /* if */
947 memory[position++] = (ucharType) ch;
948 ch = getc_unlocked(cInFile);
949 } /* while */
950 funlockfile(cInFile);
951 if (ch == (int) '\n' && position != 0 && memory[position - 1] == '\r') {
952 position--;
953 } /* if */
954 if (unlikely(ch == EOF && position == 0 && ferror(cInFile))) {
955 FREE_BSTRI(buffer, memlength);
956 logError(printf("ut8WordRead(%d, '\\" FMT_U32 ";'): "
957 "getc_unlocked(%d) failed:\n"
958 "errno=%d\nerror: %s\n",
959 safe_fileno(cInFile), *terminationChar,
960 safe_fileno(cInFile), errno, strerror(errno)););
961 raise_error(FILE_ERROR);
962 result = NULL;
963 } else {
964 if (unlikely(!ALLOC_STRI_CHECK_SIZE(result, position))) {
965 FREE_BSTRI(buffer, memlength);
966 raise_error(MEMORY_ERROR);
967 } else {
968 if (unlikely(utf8_to_stri(result->mem, &result_size, buffer->mem, position) != 0)) {
969 FREE_BSTRI(buffer, memlength);
970 FREE_STRI(result, position);
971 logError(printf("ut8WordRead(%d, '\\" FMT_U32 ";'): "
972 "The file contains an illegal encoding.\n",
973 safe_fileno(cInFile), *terminationChar););
974 raise_error(RANGE_ERROR);
975 result = NULL;
976 } else {
977 FREE_BSTRI(buffer, memlength);
978 REALLOC_STRI_SIZE_OK(resized_result, result, position, result_size);
979 if (unlikely(resized_result == NULL)) {
980 FREE_STRI(result, position);
981 raise_error(MEMORY_ERROR);
982 result = NULL;
983 } else {
984 result = resized_result;
985 COUNT3_STRI(position, result_size);
986 result->size = result_size;
987 *terminationChar = (charType) ch;
988 } /* if */
989 } /* if */
990 } /* if */
991 } /* if */
992 } /* if */
993 } /* if */
994 logFunction(printf("ut8WordRead(%d, '\\" FMT_U32 ";') --> \"%s\"\n",
995 safe_fileno(cInFile), *terminationChar,
996 striAsUnquotedCStri(result)););
997 return result;
998 } /* ut8WordRead */
999
1000
1001
1002 /**
1003 * Write a string to an UTF-8 file.
1004 * @exception FILE_ERROR A system function returns an error.
1005 */
ut8Write(fileType outFile,const const_striType stri)1006 void ut8Write (fileType outFile, const const_striType stri)
1007
1008 {
1009 cFileType cOutFile;
1010 const strElemType *str;
1011 memSizeType len;
1012 memSizeType size;
1013 ucharType stri_buffer[max_utf8_size(WRITE_STRI_BLOCK_SIZE)];
1014
1015 /* ut8Write */
1016 logFunction(printf("ut8Write(%s%d, \"%s\")\n",
1017 outFile == NULL ? "NULL " : "",
1018 outFile != NULL ? safe_fileno(outFile->cFile) : 0,
1019 striAsUnquotedCStri(stri)););
1020 cOutFile = outFile->cFile;
1021 if (unlikely(cOutFile == NULL)) {
1022 logError(printf("ut8Write: Attempt to write to closed file.\n"););
1023 raise_error(FILE_ERROR);
1024 return;
1025 } /* if */
1026 #if FWRITE_WRONG_FOR_READ_ONLY_FILES
1027 if (unlikely(stri->size > 0 && (cOutFile->flags & _F_WRIT) == 0)) {
1028 logError(printf("ut8Write: Attempt to write to read only file: %d.\n",
1029 safe_fileno(cOutFile)););
1030 raise_error(FILE_ERROR);
1031 return;
1032 } /* if */
1033 #endif
1034 for (str = stri->mem, len = stri->size; len >= WRITE_STRI_BLOCK_SIZE;
1035 str += WRITE_STRI_BLOCK_SIZE, len -= WRITE_STRI_BLOCK_SIZE) {
1036 size = stri_to_utf8(stri_buffer, str, WRITE_STRI_BLOCK_SIZE);
1037 if (unlikely(size != fwrite(stri_buffer, 1, (size_t) size, cOutFile))) {
1038 logError(printf("ut8Write: fwrite(*, 1, " FMT_U_MEM ", %d) failed:\n"
1039 "errno=%d\nerror: %s\n",
1040 size, safe_fileno(cOutFile),
1041 errno, strerror(errno)););
1042 raise_error(FILE_ERROR);
1043 return;
1044 } /* if */
1045 } /* for */
1046 if (len > 0) {
1047 size = stri_to_utf8(stri_buffer, str, len);
1048 if (unlikely(size != fwrite(stri_buffer, 1, (size_t) size, cOutFile))) {
1049 logError(printf("ut8Write: fwrite(*, 1, " FMT_U_MEM ", %d) failed:\n"
1050 "errno=%d\nerror: %s\n",
1051 size, safe_fileno(cOutFile),
1052 errno, strerror(errno)););
1053 raise_error(FILE_ERROR);
1054 return;
1055 } /* if */
1056 } /* if */
1057 logFunction(printf("ut8Write -->\n"););
1058 } /* ut8Write */
1059