1 /********************************************************************/
2 /*                                                                  */
3 /*  ut8_rtl.c     Primitive actions for the UTF-8 file type.        */
4 /*  Copyright (C) 1989 - 2015, 2018  Thomas Mertes                  */
5 /*                                                                  */
6 /*  This file is part of the Seed7 Runtime Library.                 */
7 /*                                                                  */
8 /*  The Seed7 Runtime Library is free software; you can             */
9 /*  redistribute it and/or modify it under the terms of the GNU     */
10 /*  Lesser General Public License as published by the Free Software */
11 /*  Foundation; either version 2.1 of the License, or (at your      */
12 /*  option) any later version.                                      */
13 /*                                                                  */
14 /*  The Seed7 Runtime Library is distributed in the hope that it    */
15 /*  will be useful, but WITHOUT ANY WARRANTY; without even the      */
16 /*  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR */
17 /*  PURPOSE.  See the GNU Lesser General Public License for more    */
18 /*  details.                                                        */
19 /*                                                                  */
20 /*  You should have received a copy of the GNU Lesser General       */
21 /*  Public License along with this program; if not, write to the    */
22 /*  Free Software Foundation, Inc., 51 Franklin Street,             */
23 /*  Fifth Floor, Boston, MA  02110-1301, USA.                       */
24 /*                                                                  */
25 /*  Module: Seed7 Runtime Library                                   */
26 /*  File: seed7/src/ut8_rtl.c                                       */
27 /*  Changes: 2005, 2010, 2013, 2014  Thomas Mertes                  */
28 /*  Content: Primitive actions for the UTF-8 file type.             */
29 /*                                                                  */
30 /********************************************************************/
31 
/* Compile time switches of this file. Both are switched off (0).   */
/* NOTE(review): presumably LOG_FUNCTIONS controls the logFunction() */
/* traces and VERBOSE_EXCEPTIONS controls the logError() messages   */
/* used below - confirm against the header that defines the macros. */
32 #define LOG_FUNCTIONS 0
33 #define VERBOSE_EXCEPTIONS 0
34 
35 #include "version.h"
36 
37 #include "stdlib.h"
38 #include "stdio.h"
39 #include "string.h"
40 #include "sys/types.h"
41 #include "errno.h"
42 
43 #include "common.h"
44 #include "os_decls.h"
45 #include "heaputl.h"
46 #include "striutl.h"
47 #include "fil_rtl.h"
48 #include "rtl_err.h"
49 
50 #undef EXTERN
51 #define EXTERN
52 #include "ut8_rtl.h"
53 
54 
/* Size constants of the UTF-8 file functions. */

/* Number of bytes requested per fread() by the block wise readers. */
/* Their stack buffers are BUFFER_SIZE + 6 bytes large.             */
55 #define BUFFER_SIZE             2048
/* ut8Gets() does not allocate the result in advance, if more       */
/* characters than this are requested.                              */
56 #define GETS_DEFAULT_SIZE    1048576
/* Initial size and growth step of the result string in             */
/* read_and_alloc_utf8_stri().                                      */
57 #define GETS_STRI_SIZE_DELTA    4096
/* NOTE(review): The following three constants are not referenced   */
/* in the reviewed part of the file. Presumably they are used by    */
/* the line/word readers and by the write function - confirm.       */
58 #define READ_STRI_INIT_SIZE      256
59 #define READ_STRI_SIZE_DELTA    2048
60 #define WRITE_STRI_BLOCK_SIZE    512
61 
62 
/**
 *  Decoding state that bytes_to_strelements() carries from one
 *  block of bytes to the next. The readers initialize it with
 *  {0, 0, 1, 0}, so the order of the fields must not be changed.
 */
63 typedef struct {
    /* Number of bytes at the start of the buffer, that were not  */
    /* converted yet. The next fread() stores its data after them. */
64     memSizeType bytes_remaining;
    /* Value delivered by utf8_bytes_missing() for the pending     */
    /* bytes. It is 0 if nothing is pending.                       */
65     memSizeType bytes_missing;
    /* Number of characters stored by the last conversion. The     */
    /* readers advance their write position by this value.         */
66     memSizeType chars_read;
    /* 1 if an incomplete UTF-8 sequence is pending in the buffer  */
    /* and 0 otherwise.                                            */
67     memSizeType chars_there;
68   } readStateType;
69 
70 
71 
/**
 *  Convert a block of UTF-8 bytes from 'buffer' to string elements.
 *  The buffer holds state->bytes_remaining unconverted bytes from
 *  the previous call followed by 'bytes_in_buffer' new bytes. If the
 *  data ends with an incomplete UTF-8 sequence the bytes of this
 *  sequence are moved to the start of 'buffer'. This way the next
 *  read can append the bytes that are missing.
 *  @param buffer Bytes left over from the last call plus new bytes.
 *  @param bytes_in_buffer Number of bytes delivered by the last read.
 *         A value of 0 means: No more data (end of file).
 *  @param stri_dest Destination for the converted characters.
 *  @param state Decoding state, that is carried from call to call.
 *         The number of characters stored is assigned to
 *         state->chars_read.
 *  @param err_info Unchanged if the function succeeds, and
 *                  RANGE_ERROR if the data contains an illegal
 *                  encoding or if the data ends inside of an
 *                  UTF-8 sequence.
 */
static inline void bytes_to_strelements (ustriType buffer, memSizeType bytes_in_buffer,
    strElemType *stri_dest, readStateType *state, errInfoType *err_info)

  { /* bytes_to_strelements */
    if (bytes_in_buffer != 0) {
      /* The pending bytes of the last call are converted as well. */
      bytes_in_buffer += state->bytes_remaining;
      state->bytes_remaining = utf8_to_stri(stri_dest, &state->chars_read, buffer,
                                            bytes_in_buffer);
      if (state->bytes_remaining != 0) {
        /* The conversion stopped before the end of the data. */
        state->bytes_missing = utf8_bytes_missing(
            &buffer[bytes_in_buffer - state->bytes_remaining],
            state->bytes_remaining);
        if (state->bytes_missing != 0) {
          /* Incomplete sequence: Keep it for the next call. */
          memmove(buffer, &buffer[bytes_in_buffer - state->bytes_remaining],
                  state->bytes_remaining);
          state->chars_there = 1;
        } else {
          /* The unconverted bytes are not the start of a legal sequence. */
          *err_info = RANGE_ERROR;
          return;
        } /* if */
      } else {
        state->bytes_missing = 0;
        state->chars_there = 0;
      } /* if */
    } else {
      state->chars_read = 0;
      if (state->bytes_remaining != 0) {
        /* No more data arrived, but an incomplete UTF-8 sequence   */
        /* is still pending. Without an error state->chars_there    */
        /* would stay 1 and the loops of the callers, which run as  */
        /* long as (chars_read > 0 || chars_there), would not end.  */
        *err_info = RANGE_ERROR;
      } /* if */
    } /* if */
  } /* bytes_to_strelements */
112 
113 
114 
115 /**
116  *  Read UTF-8 characters from 'inFile' into the allocated string.
117  *  Read until the string 'stri' is filled or 'inFile' reaches EOF.
118  *  @param inFile File from which UTF-8 encoded characters are read.
119  *  @param stri An allocated string for the requested number of chars.
120  *  @param err_info Unchanged if the function succeeds, and
121  *                  RANGE_ERROR if inFile contains illegal encodings, and
122  *                  FILE_ERROR if a system function returns an error.
123  *  @return the actual number of characters read.
124  */
read_utf8_string(cFileType inFile,striType stri,errInfoType * err_info)125 static memSizeType read_utf8_string (cFileType inFile, striType stri, errInfoType *err_info)
126 
127   {
128     ucharType buffer[BUFFER_SIZE + 6];
129     memSizeType bytes_in_buffer;
130     memSizeType stri_pos;
131     memSizeType chars_missing;
132     readStateType state = {0, 0, 1, 0};
133 
134   /* read_utf8_string */
135     for (stri_pos = 0, chars_missing = stri->size;
136         chars_missing >= BUFFER_SIZE - state.bytes_missing + state.chars_there &&
137         (state.chars_read > 0 || state.chars_there) &&
138         *err_info == OKAY_NO_ERROR;
139         stri_pos += state.chars_read, chars_missing -= state.chars_read) {
140       bytes_in_buffer = (memSizeType) fread(&buffer[state.bytes_remaining], 1,
141           BUFFER_SIZE, inFile);
142       if (bytes_in_buffer == 0 && stri_pos == 0 && ferror(inFile)) {
143         logError(printf("read_utf8_string: fread(*, 1, " FMT_U_MEM ", %d) failed.\n",
144                         (memSizeType) BUFFER_SIZE, safe_fileno(inFile)););
145         *err_info = FILE_ERROR;
146       } else {
147         /* printf("#A# bytes_in_buffer=%d num_of_chars_read=%d\n",
148             bytes_in_buffer, stri_pos); */
149         bytes_to_strelements(buffer, bytes_in_buffer, &stri->mem[stri_pos],
150              &state, err_info);
151       } /* if */
152     } /* for */
153     for (; chars_missing > 0 && (state.chars_read > 0 || state.chars_there) &&
154         *err_info == OKAY_NO_ERROR;
155         stri_pos += state.chars_read, chars_missing -= state.chars_read) {
156       bytes_in_buffer = (memSizeType) fread(&buffer[state.bytes_remaining], 1,
157           chars_missing - state.chars_there + state.bytes_missing, inFile);
158       if (bytes_in_buffer == 0 && stri_pos == 0 && ferror(inFile)) {
159         logError(printf("read_utf8_string: fread(*, 1, " FMT_U_MEM ", %d) failed.\n",
160                         chars_missing - state.chars_there + state.bytes_missing,
161                         safe_fileno(inFile)););
162         *err_info = FILE_ERROR;
163       } else {
164         /* printf("#B# bytes_in_buffer=%d chars_missing=%d chars_read=%d "
165             "chars_there=%d bytes_missing=%d num_of_chars_read=%d\n",
166             bytes_in_buffer, chars_missing, chars_read, chars_there,
167             state.bytes_missing, stri_pos); */
168         bytes_to_strelements(buffer, bytes_in_buffer, &stri->mem[stri_pos],
169              &state, err_info);
170       } /* if */
171     } /* for */
172     return stri_pos;
173   } /* read_utf8_string */
174 
175 
176 
177 /**
178  *  Read up to 'chars_missing' UTF-8 characters from 'inFile'.
179  *  Read until 'chars_missing' characters are read or 'inFile' reaches EOF.
180  *  @param inFile File from which UTF-8 encoded characters are read.
181  *  @param chars_missing Maximum number of characters to be read.
182  *  @param num_of_chars_read Address to which the actual number of
183  *         characters read is assigned.
184  *  @param err_info Unchanged if the function succeeds, and
185  *                  RANGE_ERROR if inFile contains illegal encodings, and
186  *                  FILE_ERROR if a system function returns an error, and
187  *                  MEMORY_ERROR if there was not enough memory.
188  */
read_and_alloc_utf8_stri(cFileType inFile,memSizeType chars_missing,memSizeType * num_of_chars_read,errInfoType * err_info)189 static striType read_and_alloc_utf8_stri (cFileType inFile, memSizeType chars_missing,
190     memSizeType *num_of_chars_read, errInfoType *err_info)
191 
192   {
193     ucharType buffer[BUFFER_SIZE + 6];
194     memSizeType bytes_in_buffer;
195     memSizeType result_pos;
196     memSizeType new_size;
197     striType resized_result;
198     readStateType state = {0, 0, 1, 0};
199     striType result;
200 
201   /* read_and_alloc_utf8_stri */
202     logFunction(printf("read_and_alloc_utf8_stri(%d, " FMT_U_MEM ", *, *)\n",
203                        safe_fileno(inFile), chars_missing););
204     if (unlikely(!ALLOC_STRI_SIZE_OK(result, GETS_STRI_SIZE_DELTA))) {
205       *err_info = MEMORY_ERROR;
206       result = NULL;
207     } else {
208       result->size = GETS_STRI_SIZE_DELTA;
209       for (result_pos = 0;
210           chars_missing >= BUFFER_SIZE - state.bytes_missing + state.chars_there &&
211           (state.chars_read > 0 || state.chars_there) &&
212           *err_info == OKAY_NO_ERROR;
213           result_pos += state.chars_read, chars_missing -= state.chars_read) {
214         bytes_in_buffer = (memSizeType) fread(&buffer[state.bytes_remaining], 1,
215             BUFFER_SIZE, inFile);
216         if (bytes_in_buffer == 0 && result_pos == 0 && ferror(inFile)) {
217           logError(printf("read_and_alloc_utf8_stri: "
218                           "fread(*, 1, " FMT_U_MEM ", %d) failed.\n",
219                           (memSizeType) LIST_BUFFER_SIZE, safe_fileno(inFile)););
220           *err_info = FILE_ERROR;
221         } else {
222           /* printf("#A# bytes_in_buffer=%d num_of_chars_read=%d\n",
223               bytes_in_buffer, result_pos); */
224           if (result_pos + bytes_in_buffer > result->size) {
225             new_size = result->size + GETS_STRI_SIZE_DELTA;
226             REALLOC_STRI_CHECK_SIZE(resized_result, result, result->size, new_size);
227             if (resized_result == NULL) {
228               *err_info = MEMORY_ERROR;
229               return result;
230             } else {
231               result = resized_result;
232               COUNT3_STRI(result->size, new_size);
233               result->size = new_size;
234             } /* if */
235           } /* if */
236           bytes_to_strelements(buffer, bytes_in_buffer, &result->mem[result_pos],
237               &state, err_info);
238         } /* if */
239       } /* for */
240       for (; chars_missing > 0 && (state.chars_read > 0 || state.chars_there) &&
241           *err_info == OKAY_NO_ERROR;
242           result_pos += state.chars_read, chars_missing -= state.chars_read) {
243         bytes_in_buffer = (memSizeType) fread(&buffer[state.bytes_remaining], 1,
244             chars_missing - state.chars_there + state.bytes_missing, inFile);
245         if (bytes_in_buffer == 0 && result_pos == 0 && ferror(inFile)) {
246           logError(printf("read_and_alloc_utf8_stri: "
247                           "fread(*, 1, " FMT_U_MEM ", %d) failed.\n",
248                           chars_missing - state.chars_there + state.bytes_missing,
249                           safe_fileno(inFile)););
250           *err_info = FILE_ERROR;
251         } else {
252           /* printf("#B# bytes_in_buffer=%d chars_missing=%d chars_read=%d "
253               "chars_there=%d bytes_missing=%d num_of_chars_read=%d\n",
254               bytes_in_buffer, chars_missing, chars_read, chars_there,
255               state.bytes_missing, result_pos); */
256           if (result_pos + bytes_in_buffer > result->size) {
257             new_size = result->size + GETS_STRI_SIZE_DELTA;
258             REALLOC_STRI_CHECK_SIZE(resized_result, result, result->size, new_size);
259             if (resized_result == NULL) {
260               *err_info = MEMORY_ERROR;
261               return result;
262             } else {
263               result = resized_result;
264               COUNT3_STRI(result->size, new_size);
265               result->size = new_size;
266             } /* if */
267           } /* if */
268           bytes_to_strelements(buffer, bytes_in_buffer, &result->mem[result_pos],
269               &state, err_info);
270         } /* if */
271       } /* for */
272       *num_of_chars_read = result_pos;
273     } /* if */
274     logFunction(printf("read_and_alloc_utf8_stri(%d, " FMT_U_MEM ", " FMT_U_MEM ", %d) -->\n",
275                        safe_fileno(inFile), chars_missing, *num_of_chars_read, *err_info););
276     return result;
277   } /* read_and_alloc_utf8_stri */
278 
279 
280 
281 /**
282  *  Read a character from an UTF-8 file.
283  *  @return the character read, or EOF at the end of the file.
284  *  @exception RANGE_ERROR The file contains an illegal encoding.
285  */
ut8Getc(fileType inFile)286 charType ut8Getc (fileType inFile)
287 
288   {
289     cFileType cInFile;
290     int character;
291     charType result;
292 
293   /* ut8Getc */
294     cInFile = inFile->cFile;
295     if (unlikely(cInFile == NULL)) {
296       logError(printf("ut8Getc: Attempt to read from closed file.\n"););
297       raise_error(FILE_ERROR);
298       return 0;
299     } /* if */
300     character = getc(cInFile);
301     if (character != EOF && character >= 0x80) {
302       /* character range 0x80 to 0xFF (128 to 255) */
303       if (unlikely(character <= 0xBF)) {
304         /* character range 0xC0 to 0xBF (128 to 191) */
305         logError(printf("ut8Getc(%d): "
306                         "Unexpected UTF-8 continuation byte ('\\16#%02x;').\n",
307                         safe_fileno(cInFile), character););
308         raise_error(RANGE_ERROR);
309         return 0;
310       } else if (character <= 0xDF) {
311         /* character range 192 to 223 (leading bits 110.....) */
312         result = (charType) (character & 0x1F) << 6;
313         character = getc(cInFile);
314         if (character >= 0x80 && character <= 0xBF) {
315           /* character range 128 to 191 (leading bits 10......) */
316           result |= character & 0x3F;
317           if (unlikely(result <= 0x7F)) {
318             logError(printf("ut8Getc(%d): "
319                             "Overlong encodings are illegal "
320                             "('\\16#" FMT_X32 ";').\n",
321                             safe_fileno(cInFile), result););
322             raise_error(RANGE_ERROR);
323             return 0;
324           } else {
325             /* correct encodings are in the range */
326             /* 0x80 to 0x07FF (128 to 2047)       */
327           } /* if */
328         } else {
329           logError(printf("ut8Getc(%d): "
330                           "UTF-8 continuation byte expected "
331                           "(found '\\16#%02x;').\n",
332                           safe_fileno(cInFile), character););
333           raise_error(RANGE_ERROR);
334           return 0;
335         } /* if */
336       } else if (character <= 0xEF) {
337         /* character range 224 to 239 (leading bits 1110....) */
338         result = (charType) (character & 0x0F) << 12;
339         character = getc(cInFile);
340         if (character >= 0x80 && character <= 0xBF) {
341           /* character range 128 to 191 (leading bits 10......) */
342           result |= (charType) (character & 0x3F) << 6;
343           character = getc(cInFile);
344           if (character >= 0x80 && character <= 0xBF) {
345             result |= character & 0x3F;
346             if (unlikely(result <= 0x7FF)) {
347               /* (result >= 0xD800 && result <= 0xDFFF)) */
348               logError(printf("ut8Getc(%d): "
349                               "Overlong encodings are illegal "
350                               "('\\16#" FMT_X32 ";').\n",
351                               safe_fileno(cInFile), result););
352               raise_error(RANGE_ERROR);
353               return 0;
354             } else {
355               /* correct encodings are in the range */
356               /* 0x800 to 0xFFFF (2048 to 65535)    */
357             } /* if */
358           } else {
359             logError(printf("ut8Getc(%d): "
360                             "UTF-8 continuation byte expected "
361                             "(found '\\16#%02x;').\n",
362                             safe_fileno(cInFile), character););
363             raise_error(RANGE_ERROR);
364             return 0;
365           } /* if */
366         } else {
367           logError(printf("ut8Getc(%d): "
368                           "UTF-8 continuation byte expected "
369                           "(found '\\16#%02x;').\n",
370                           safe_fileno(cInFile), character););
371           raise_error(RANGE_ERROR);
372           return 0;
373         } /* if */
374       } else if (character <= 0xF7) {
375         /* character range 240 to 247 (leading bits 11110...) */
376         result = (charType) (character & 0x07) << 18;
377         character = getc(cInFile);
378         if (character >= 0x80 && character <= 0xBF) {
379           /* character range 128 to 191 (leading bits 10......) */
380           result |= (charType) (character & 0x3F) << 12;
381           character = getc(cInFile);
382           if (character >= 0x80 && character <= 0xBF) {
383             result |= (charType) (character & 0x3F) << 6;
384             character = getc(cInFile);
385             if (character >= 0x80 && character <= 0xBF) {
386               result |= character & 0x3F;
387               if (unlikely(result <= 0xFFFF)) {
388                 logError(printf("ut8Getc(%d): "
389                                 "Overlong encodings are illegal "
390                                 "('\\16#" FMT_X32 ";').\n",
391                                 safe_fileno(cInFile), result););
392                 raise_error(RANGE_ERROR);
393                 return 0;
394               } else {
395                 /* correct encodings are in the range        */
396                 /* 0x10000 to 0x10FFFF (65536 to 1114111)    */
397                 /* allowed encodings are in the range        */
398                 /* 0x110000 to 0x1FFFFF (1114112 to 2097151) */
399               } /* if */
400             } else {
401               logError(printf("ut8Getc(%d): "
402                               "UTF-8 continuation byte expected "
403                               "(found '\\16#%02x;').\n",
404                               safe_fileno(cInFile), character););
405               raise_error(RANGE_ERROR);
406               return 0;
407             } /* if */
408           } else {
409             logError(printf("ut8Getc(%d): "
410                             "UTF-8 continuation byte expected "
411                             "(found '\\16#%02x;').\n",
412                             safe_fileno(cInFile), character););
413             raise_error(RANGE_ERROR);
414             return 0;
415           } /* if */
416         } else {
417           logError(printf("ut8Getc(%d): "
418                           "UTF-8 continuation byte expected "
419                           "(found '\\16#%02x;').\n",
420                           safe_fileno(cInFile), character););
421           raise_error(RANGE_ERROR);
422           return 0;
423         } /* if */
424       } else if (character <= 0xFB) {
425         /* character range 248 to 251 (leading bits 111110..) */
426         result = (charType) (character & 0x03) << 24;
427         character = getc(cInFile);
428         if (character >= 0x80 && character <= 0xBF) {
429           /* character range 128 to 191 (leading bits 10......) */
430           result |= (charType) (character & 0x3F) << 18;
431           character = getc(cInFile);
432           if (character >= 0x80 && character <= 0xBF) {
433             result |= (charType) (character & 0x3F) << 12;
434             character = getc(cInFile);
435             if (character >= 0x80 && character <= 0xBF) {
436               result |= (charType) (character & 0x3F) << 6;
437               character = getc(cInFile);
438               if (character >= 0x80 && character <= 0xBF) {
439                 result |= character & 0x3F;
440                 if (unlikely(result <= 0x1FFFFF)) {
441                   logError(printf("ut8Getc(%d): "
442                                   "Overlong encodings are illegal "
443                                   "('\\16#" FMT_X32 ";').\n",
444                                   safe_fileno(cInFile), result););
445                   raise_error(RANGE_ERROR);
446                   return 0;
447                 } else {
448                   /* allowed encodings are in the range          */
449                   /* 0x200000 to 0x3FFFFFF (2097152 to 67108863) */
450                 } /* if */
451               } else {
452                 logError(printf("ut8Getc(%d): "
453                                 "UTF-8 continuation byte expected "
454                                 "(found '\\16#%02x;').\n",
455                                 safe_fileno(cInFile), character););
456                 raise_error(RANGE_ERROR);
457                 return 0;
458               } /* if */
459             } else {
460               logError(printf("ut8Getc(%d): "
461                               "UTF-8 continuation byte expected "
462                               "(found '\\16#%02x;').\n",
463                               safe_fileno(cInFile), character););
464               raise_error(RANGE_ERROR);
465               return 0;
466             } /* if */
467           } else {
468             logError(printf("ut8Getc(%d): "
469                             "UTF-8 continuation byte expected "
470                             "(found '\\16#%02x;').\n",
471                             safe_fileno(cInFile), character););
472             raise_error(RANGE_ERROR);
473             return 0;
474           } /* if */
475         } else {
476           logError(printf("ut8Getc(%d): "
477                           "UTF-8 continuation byte expected "
478                           "(found '\\16#%02x;').\n",
479                           safe_fileno(cInFile), character););
480           raise_error(RANGE_ERROR);
481           return 0;
482         } /* if */
483       } else { /* if (character <= 0xFF) { */
484         /* character range 252 to 255 (leading bits 111111..) */
485         result = (charType) (character & 0x03) << 30;
486         character = getc(cInFile);
487         if (character >= 0x80 && character <= 0xBF) {
488           /* character range 128 to 191 (leading bits 10......) */
489           result |= (charType) (character & 0x3F) << 24;
490           character = getc(cInFile);
491           if (character >= 0x80 && character <= 0xBF) {
492             result |= (charType) (character & 0x3F) << 18;
493             character = getc(cInFile);
494             if (character >= 0x80 && character <= 0xBF) {
495               result |= (charType) (character & 0x3F) << 12;
496               character = getc(cInFile);
497               if (character >= 0x80 && character <= 0xBF) {
498                 result |= (charType) (character & 0x3F) <<  6;
499                 character = getc(cInFile);
500                 if (character >= 0x80 && character <= 0xBF) {
501                   result |= character & 0x3F;
502                   if (unlikely(result <= 0x3FFFFFF)) {
503                     logError(printf("ut8Getc(%d): "
504                                     "Overlong encodings are illegal "
505                                     "('\\16#" FMT_X32 ";').\n",
506                                     safe_fileno(cInFile), result););
507                     raise_error(RANGE_ERROR);
508                     return 0;
509                   } else {
510                     /* allowed encodings are in the range               */
511                     /* 0x4000000 to 0xFFFFFFFF (67108864 to 4294967295) */
512                   } /* if */
513                 } else {
514                   logError(printf("ut8Getc(%d): "
515                                   "UTF-8 continuation byte expected "
516                                   "(found '\\16#%02x;').\n",
517                                   safe_fileno(cInFile), character););
518                   raise_error(RANGE_ERROR);
519                   return 0;
520                 } /* if */
521               } else {
522                 logError(printf("ut8Getc(%d): "
523                                 "UTF-8 continuation byte expected "
524                                 "(found '\\16#%02x;').\n",
525                                 safe_fileno(cInFile), character););
526                 raise_error(RANGE_ERROR);
527                 return 0;
528               } /* if */
529             } else {
530               logError(printf("ut8Getc(%d): "
531                               "UTF-8 continuation byte expected "
532                               "(found '\\16#%02x;').\n",
533                               safe_fileno(cInFile), character););
534               raise_error(RANGE_ERROR);
535               return 0;
536             } /* if */
537           } else {
538             logError(printf("ut8Getc(%d): "
539                             "UTF-8 continuation byte expected "
540                             "(found '\\16#%02x;').\n",
541                             safe_fileno(cInFile), character););
542             raise_error(RANGE_ERROR);
543             return 0;
544           } /* if */
545         } else {
546           logError(printf("ut8Getc(%d): "
547                           "UTF-8 continuation byte expected "
548                           "(found '\\16#%02x;').\n",
549                           safe_fileno(cInFile), character););
550           raise_error(RANGE_ERROR);
551           return 0;
552         } /* if */
553       } /* if */
554     } else {
555       result = (charType) (scharType) character;
556     } /* if */
557     return result;
558   } /* ut8Getc */
559 
560 
561 
562 /**
563  *  Read a string with 'length' characters from an UTF-8 file.
564  *  In order to work reasonable good for the common case (reading
565  *  just some characters) memory for 'length' characters is requested
566  *  with malloc(). After the data is read the result string is
567  *  shrunk to the actual size (with realloc()). If 'length' is
568  *  larger than GETS_DEFAULT_SIZE or the memory cannot be requested
569  *  a different strategy is used. In this case the function tries to
570  *  find out the number of available characters (this is possible
571  *  for a regular file but not for a pipe). If this fails a third
572  *  strategy is used. In this case a smaller block is requested. This
573  *  block is filled with data, resized and filled in a loop.
574  *  @return the string read.
575  *  @exception RANGE_ERROR The length is negative or the file
576  *             contains an illegal encoding.
577  */
ut8Gets(fileType inFile,intType length)578 striType ut8Gets (fileType inFile, intType length)
579 
580   {
581     cFileType cInFile;
582     memSizeType chars_requested;
583     memSizeType bytes_there;
584     memSizeType allocated_size;
585     errInfoType err_info = OKAY_NO_ERROR;
586     memSizeType num_of_chars_read;
587     striType resized_result;
588     striType result;
589 
590   /* ut8Gets */
591     logFunction(printf("ut8Gets(%s%d, " FMT_D ")\n",
592                        inFile == NULL ? "NULL " : "",
593                        inFile != NULL ? safe_fileno(inFile->cFile) : 0,
594                        length););
595     cInFile = inFile->cFile;
596     if (unlikely(cInFile == NULL)) {
597       logError(printf("ut8Gets: Attempt to read from closed file.\n"););
598       raise_error(FILE_ERROR);
599       result = NULL;
600     } else if (unlikely(length <= 0)) {
601       if (unlikely(length != 0)) {
602         logError(printf("ut8Gets(%d, " FMT_D "): Negative length.\n",
603                         safe_fileno(cInFile), length););
604         raise_error(RANGE_ERROR);
605         result = NULL;
606       } else {
607         if (unlikely(!ALLOC_STRI_SIZE_OK(result, 0))) {
608           raise_error(MEMORY_ERROR);
609         } else {
610           result->size = 0;
611         } /* if */
612       } /* if */
613     } else {
614       if ((uintType) length > MAX_MEMSIZETYPE) {
615         chars_requested = MAX_MEMSIZETYPE;
616       } else {
617         chars_requested = (memSizeType) length;
618       } /* if */
619       if (chars_requested > GETS_DEFAULT_SIZE) {
620         /* Avoid requesting too much */
621         result = NULL;
622       } else {
623         allocated_size = chars_requested;
624         (void) ALLOC_STRI_SIZE_OK(result, allocated_size);
625       } /* if */
626       if (result == NULL) {
627         bytes_there = remainingBytesInFile(cInFile);
628         /* printf("bytes_there=" FMT_U_MEM "\n", bytes_there); */
629         if (bytes_there != 0) {
630           /* Now we know that bytes_there bytes are available in cInFile */
631           if (chars_requested <= bytes_there) {
632             allocated_size = chars_requested;
633           } else {
634             allocated_size = bytes_there;
635           } /* if */
636           /* printf("allocated_size=" FMT_U_MEM "\n", allocated_size); */
637           if (unlikely(!ALLOC_STRI_CHECK_SIZE(result, allocated_size))) {
638             /* printf("MAX_STRI_LEN=%lu, SIZ_STRI(MAX_STRI_LEN)=%lu\n",
639                 MAX_STRI_LEN, SIZ_STRI(MAX_STRI_LEN)); */
640             raise_error(MEMORY_ERROR);
641             return NULL;
642           } /* if */
643         } /* if */
644       } /* if */
645       if (result != NULL) {
646         /* We have allocated a buffer for the requested number of chars
647            or for the number of bytes which are available in the file */
648         result->size = allocated_size;
649         num_of_chars_read = read_utf8_string(cInFile, result, &err_info);
650       } else {
651         /* We do not know how many bytes are available therefore
652            result is resized with GETS_STRI_SIZE_DELTA until we
653            have read enough or we reach EOF */
654         result = read_and_alloc_utf8_stri(cInFile, chars_requested, &num_of_chars_read,
655                                           &err_info);
656       } /* if */
657       if (unlikely(err_info != OKAY_NO_ERROR)) {
658         if (result != NULL) {
659           FREE_STRI(result, result->size);
660         } /* if */
661         raise_error(err_info);
662         result = NULL;
663       } else if (num_of_chars_read < result->size) {
664         REALLOC_STRI_SIZE_SMALLER(resized_result, result, result->size, num_of_chars_read);
665         if (unlikely(resized_result == NULL)) {
666           FREE_STRI(result, result->size);
667           raise_error(MEMORY_ERROR);
668           result = NULL;
669         } else {
670           result = resized_result;
671           COUNT3_STRI(result->size, num_of_chars_read);
672           result->size = num_of_chars_read;
673         } /* if */
674       } /* if */
675     } /* if */
676     logFunction(printf("ut8Gets(%d, " FMT_D ") --> \"%s\"\n",
677                        safe_fileno(cInFile), length, striAsUnquotedCStri(result)););
678     return result;
679   } /* ut8Gets */
680 
681 
682 
683 /**
 *  Read a line from a UTF-8 file.
685  *  The function accepts lines ending with '\n', "\r\n" or EOF.
686  *  The line ending characters are not copied into the string.
687  *  That means that the '\r' of a "\r\n" sequence is silently removed.
688  *  When the function is left terminationChar contains '\n' or EOF.
689  *  @return the line read.
690  *  @exception RANGE_ERROR The file contains an illegal encoding.
691  *  @exception MEMORY_ERROR Not enough memory to represent the result.
692  *  @exception FILE_ERROR A system function returns an error.
693  */
ut8LineRead(fileType inFile,charType * terminationChar)694 striType ut8LineRead (fileType inFile, charType *terminationChar)
695 
696   {
697     cFileType cInFile;
698     register int ch;
699     register memSizeType position;
700     ucharType *memory;
701     memSizeType memlength;
702     memSizeType newmemlength;
703     bstriType resized_buffer;
704     bstriType buffer;
705     memSizeType result_size;
706     striType resized_result;
707     striType result;
708 
709   /* ut8LineRead */
710     logFunction(printf("ut8LineRead(%s%d, '\\" FMT_U32 ";')\n",
711                        inFile == NULL ? "NULL " : "",
712                        inFile != NULL ? safe_fileno(inFile->cFile) : 0,
713                        *terminationChar););
714     cInFile = inFile->cFile;
715     if (unlikely(cInFile == NULL)) {
716       logError(printf("ut8LineRead: Attempt to read from closed file.\n"););
717       raise_error(FILE_ERROR);
718       result = NULL;
719     } else {
720       memlength = READ_STRI_INIT_SIZE;
721       if (unlikely(!ALLOC_BSTRI_SIZE_OK(buffer, memlength))) {
722         raise_error(MEMORY_ERROR);
723         result = NULL;
724       } else {
725         memory = buffer->mem;
726         position = 0;
727         flockfile(cInFile);
728         while ((ch = getc_unlocked(cInFile)) != (int) '\n' && ch != EOF) {
729           if (position >= memlength) {
730             newmemlength = memlength + READ_STRI_SIZE_DELTA;
731             REALLOC_BSTRI_CHECK_SIZE(resized_buffer, buffer, memlength, newmemlength);
732             if (unlikely(resized_buffer == NULL)) {
733               FREE_BSTRI(buffer, memlength);
734               funlockfile(cInFile);
735               raise_error(MEMORY_ERROR);
736               return NULL;
737             } /* if */
738             buffer = resized_buffer;
739             COUNT3_BSTRI(memlength, newmemlength);
740             memory = buffer->mem;
741             memlength = newmemlength;
742           } /* if */
743           memory[position++] = (ucharType) ch;
744         } /* while */
745         funlockfile(cInFile);
746         if (ch == (int) '\n' && position != 0 && memory[position - 1] == '\r') {
747           position--;
748         } /* if */
749         if (unlikely(ch == EOF && position == 0 && ferror(cInFile))) {
750           FREE_BSTRI(buffer, memlength);
751           logError(printf("ut8LineRead(%d, '\\" FMT_U32 ";'): "
752                           "getc_unlocked(%d) failed:\n"
753                           "errno=%d\nerror: %s\n",
754                           safe_fileno(cInFile), *terminationChar,
755                           safe_fileno(cInFile), errno, strerror(errno)););
756           raise_error(FILE_ERROR);
757           result = NULL;
758         } else {
759           if (unlikely(!ALLOC_STRI_CHECK_SIZE(result, position))) {
760             FREE_BSTRI(buffer, memlength);
761             raise_error(MEMORY_ERROR);
762           } else {
763             if (unlikely(utf8_to_stri(result->mem, &result_size, buffer->mem, position) != 0)) {
764               FREE_BSTRI(buffer, memlength);
765               FREE_STRI(result, position);
766               logError(printf("ut8LineRead(%d, '\\" FMT_U32 ";'): "
767                               "The file contains an illegal encoding.\n",
768                               safe_fileno(cInFile), *terminationChar););
769               raise_error(RANGE_ERROR);
770               result = NULL;
771             } else {
772               FREE_BSTRI(buffer, memlength);
773               REALLOC_STRI_SIZE_OK(resized_result, result, position, result_size);
774               if (unlikely(resized_result == NULL)) {
775                 FREE_STRI(result, position);
776                 raise_error(MEMORY_ERROR);
777                 result = NULL;
778               } else {
779                 result = resized_result;
780                 COUNT3_STRI(position, result_size);
781                 result->size = result_size;
782                 *terminationChar = (charType) ch;
783               } /* if */
784             } /* if */
785           } /* if */
786         } /* if */
787       } /* if */
788     } /* if */
789     logFunction(printf("ut8LineRead(%d, '\\" FMT_U32 ";') --> \"%s\"\n",
790                        safe_fileno(cInFile), *terminationChar,
791                        striAsUnquotedCStri(result)););
792     return result;
793   } /* ut8LineRead */
794 
795 
796 
797 /**
798  *  Set the current file position.
799  *  The file position is measured in bytes from the start of the file.
800  *  The first byte in the file has the position 1.
 *  If the file position would be in the middle of a UTF-8 encoded
802  *  character the position is advanced to the beginning of the next
803  *  UTF-8 character.
804  *  @exception RANGE_ERROR The file position is negative or zero or
805  *             the file position is not representable in the system
806  *             file position type.
807  *  @exception FILE_ERROR The system function returns an error.
808  */
void ut8Seek (fileType aFile, intType position)

  {
    cFileType cFile;
    int byteRead;
    int backStep;

  /* ut8Seek */
    logFunction(printf("ut8Seek(%s%d, " FMT_D ")\n",
                       aFile == NULL ? "NULL " : "",
                       aFile != NULL ? safe_fileno(aFile->cFile) : 0,
                       position););
    cFile = aFile->cFile;
    /* Reject a closed file. */
    if (unlikely(cFile == NULL)) {
      logError(printf("ut8Seek: Attempt to set the current position of a closed file.\n"););
      raise_error(FILE_ERROR);
      return;
    } /* if */
    /* Positions start at 1. */
    if (unlikely(position <= 0)) {
      logError(printf("ut8Seek(%d, " FMT_D "): Position <= 0.\n",
                      safe_fileno(cFile), position););
      raise_error(RANGE_ERROR);
      return;
    } /* if */
#if OS_OFF_T_SIZE < INTTYPE_SIZE
    /* The position must fit into the system file position type. */
#if OS_OFF_T_SIZE == 32
    if (unlikely(position > INT32TYPE_MAX)) {
      logError(printf("ut8Seek(%d, " FMT_D "): "
                      "Position not representable in the system file position type.\n",
                      safe_fileno(cFile), position););
      raise_error(RANGE_ERROR);
      return;
    } /* if */
#elif OS_OFF_T_SIZE == 64
    if (unlikely(position > INT64TYPE_MAX)) {
      logError(printf("ut8Seek(%d, " FMT_D "): "
                      "Position not representable in the system file position type.\n",
                      safe_fileno(cFile), position););
      raise_error(RANGE_ERROR);
      return;
    } /* if */
#else
#error "sizeof(os_off_t) is neither 4 nor 8."
#endif
#endif
    /* Move to the requested byte (the position is 1-based). */
    if (unlikely(offsetSeek(cFile, (os_off_t) (position - 1), SEEK_SET) != 0)) {
      logError(printf("ut8Seek(%d, " FMT_D "): "
                      "offsetSeek(%d, " FMT_D ", SEEK_SET) failed.\n"
                      "errno=%d\nerror: %s\n",
                      safe_fileno(cFile), position,
                      safe_fileno(cFile), position - 1,
                      errno, strerror(errno)););
      raise_error(FILE_ERROR);
      return;
    } /* if */
    /* Skip bytes in the range 0x80 .. 0xBF (UTF-8 continuation     */
    /* bytes). EOF is negative, so it ends the loop like any other  */
    /* byte outside of this range.                                  */
    do {
      byteRead = getc(cFile);
    } while (byteRead >= 0x80 && byteRead <= 0xBF);
    /* A byte that ended the loop has been consumed and must be */
    /* given back. At EOF there is nothing to give back.        */
    backStep = byteRead != EOF ? -1 : 0;
    /* According to the specification of file I/O input   */
    /* shall not be directly followed by output without   */
    /* an intervening call to a file positioning function */
    /* (e.g. fseek()). For this reason a seek is done to  */
    /* allow that a write can directly follow ut8Seek().  */
    if (unlikely(offsetSeek(cFile, (os_off_t) backStep, SEEK_CUR) != 0)) {
      logError(printf("ut8Seek(%d, " FMT_D "): "
                      "offsetSeek(%d, %d, SEEK_CUR) failed.\n"
                      "errno=%d\nerror: %s\n",
                      safe_fileno(cFile), position,
                      safe_fileno(cFile), backStep,
                      errno, strerror(errno)););
      raise_error(FILE_ERROR);
    } /* if */
  } /* ut8Seek */
878 
879 
880 
881 /**
 *  Read a word from a UTF-8 file.
883  *  Before reading the word it skips spaces and tabs. The function
884  *  accepts words ending with ' ', '\t', '\n', "\r\n" or EOF.
885  *  The word ending characters are not copied into the string.
886  *  That means that the '\r' of a "\r\n" sequence is silently removed.
887  *  When the function is left terminationChar contains ' ', '\t', '\n' or
888  *  EOF.
889  *  @return the word read.
890  *  @exception RANGE_ERROR The file contains an illegal encoding.
891  *  @exception MEMORY_ERROR Not enough memory to represent the result.
892  *  @exception FILE_ERROR A system function returns an error.
893  */
ut8WordRead(fileType inFile,charType * terminationChar)894 striType ut8WordRead (fileType inFile, charType *terminationChar)
895 
896   {
897     cFileType cInFile;
898     register int ch;
899     register memSizeType position;
900     ucharType *memory;
901     memSizeType memlength;
902     memSizeType newmemlength;
903     bstriType resized_buffer;
904     bstriType buffer;
905     memSizeType result_size;
906     striType resized_result;
907     striType result;
908 
909   /* ut8WordRead */
910     logFunction(printf("ut8WordRead(%s%d, '\\" FMT_U32 ";')\n",
911                        inFile == NULL ? "NULL " : "",
912                        inFile != NULL ? safe_fileno(inFile->cFile) : 0,
913                        *terminationChar););
914     cInFile = inFile->cFile;
915     if (unlikely(cInFile == NULL)) {
916       logError(printf("ut8WordRead: Attempt to read from closed file.\n"););
917       raise_error(FILE_ERROR);
918       result = NULL;
919     } else {
920       memlength = READ_STRI_INIT_SIZE;
921       if (unlikely(!ALLOC_BSTRI_SIZE_OK(buffer, memlength))) {
922         raise_error(MEMORY_ERROR);
923         result = NULL;
924       } else {
925         memory = buffer->mem;
926         position = 0;
927         flockfile(cInFile);
928         do {
929           ch = getc_unlocked(cInFile);
930         } while (ch == (int) ' ' || ch == (int) '\t');
931         while (ch != (int) ' ' && ch != (int) '\t' &&
932             ch != (int) '\n' && ch != EOF) {
933           if (position >= memlength) {
934             newmemlength = memlength + READ_STRI_SIZE_DELTA;
935             REALLOC_BSTRI_CHECK_SIZE(resized_buffer, buffer, memlength, newmemlength);
936             if (unlikely(resized_buffer == NULL)) {
937               FREE_BSTRI(buffer, memlength);
938               funlockfile(cInFile);
939               raise_error(MEMORY_ERROR);
940               return NULL;
941             } /* if */
942             buffer = resized_buffer;
943             COUNT3_BSTRI(memlength, newmemlength);
944             memory = buffer->mem;
945             memlength = newmemlength;
946           } /* if */
947           memory[position++] = (ucharType) ch;
948           ch = getc_unlocked(cInFile);
949         } /* while */
950         funlockfile(cInFile);
951         if (ch == (int) '\n' && position != 0 && memory[position - 1] == '\r') {
952           position--;
953         } /* if */
954         if (unlikely(ch == EOF && position == 0 && ferror(cInFile))) {
955           FREE_BSTRI(buffer, memlength);
956           logError(printf("ut8WordRead(%d, '\\" FMT_U32 ";'): "
957                           "getc_unlocked(%d) failed:\n"
958                           "errno=%d\nerror: %s\n",
959                           safe_fileno(cInFile), *terminationChar,
960                           safe_fileno(cInFile), errno, strerror(errno)););
961           raise_error(FILE_ERROR);
962           result = NULL;
963         } else {
964           if (unlikely(!ALLOC_STRI_CHECK_SIZE(result, position))) {
965             FREE_BSTRI(buffer, memlength);
966             raise_error(MEMORY_ERROR);
967           } else {
968             if (unlikely(utf8_to_stri(result->mem, &result_size, buffer->mem, position) != 0)) {
969               FREE_BSTRI(buffer, memlength);
970               FREE_STRI(result, position);
971               logError(printf("ut8WordRead(%d, '\\" FMT_U32 ";'): "
972                               "The file contains an illegal encoding.\n",
973                               safe_fileno(cInFile), *terminationChar););
974               raise_error(RANGE_ERROR);
975               result = NULL;
976             } else {
977               FREE_BSTRI(buffer, memlength);
978               REALLOC_STRI_SIZE_OK(resized_result, result, position, result_size);
979               if (unlikely(resized_result == NULL)) {
980                 FREE_STRI(result, position);
981                 raise_error(MEMORY_ERROR);
982                 result = NULL;
983               } else {
984                 result = resized_result;
985                 COUNT3_STRI(position, result_size);
986                 result->size = result_size;
987                 *terminationChar = (charType) ch;
988               } /* if */
989             } /* if */
990           } /* if */
991         } /* if */
992       } /* if */
993     } /* if */
994     logFunction(printf("ut8WordRead(%d, '\\" FMT_U32 ";') --> \"%s\"\n",
995                        safe_fileno(cInFile), *terminationChar,
996                        striAsUnquotedCStri(result)););
997     return result;
998   } /* ut8WordRead */
999 
1000 
1001 
1002 /**
 *  Write a string to a UTF-8 file.
1004  *  @exception FILE_ERROR A system function returns an error.
1005  */
void ut8Write (fileType outFile, const const_striType stri)

  {
    cFileType cOutFile;
    const strElemType *source;
    memSizeType remaining;
    memSizeType chunk_len;
    memSizeType bytes_to_write;
    ucharType utf8_buffer[max_utf8_size(WRITE_STRI_BLOCK_SIZE)];

  /* ut8Write */
    logFunction(printf("ut8Write(%s%d, \"%s\")\n",
                       outFile == NULL ? "NULL " : "",
                       outFile != NULL ? safe_fileno(outFile->cFile) : 0,
                       striAsUnquotedCStri(stri)););
    cOutFile = outFile->cFile;
    if (unlikely(cOutFile == NULL)) {
      logError(printf("ut8Write: Attempt to write to closed file.\n"););
      raise_error(FILE_ERROR);
      return;
    } /* if */
#if FWRITE_WRONG_FOR_READ_ONLY_FILES
    if (unlikely(stri->size > 0 && (cOutFile->flags & _F_WRIT) == 0)) {
      logError(printf("ut8Write: Attempt to write to read only file: %d.\n",
                      safe_fileno(cOutFile)););
      raise_error(FILE_ERROR);
      return;
    } /* if */
#endif
    /* The string is converted and written in chunks of at most   */
    /* WRITE_STRI_BLOCK_SIZE characters. This way the conversion  */
    /* buffer on the stack is sufficient for strings of any size. */
    source = stri->mem;
    remaining = stri->size;
    while (remaining > 0) {
      if (remaining >= WRITE_STRI_BLOCK_SIZE) {
        chunk_len = WRITE_STRI_BLOCK_SIZE;
      } else {
        /* The last chunk holds the rest of the string. */
        chunk_len = remaining;
      } /* if */
      bytes_to_write = stri_to_utf8(utf8_buffer, source, chunk_len);
      if (unlikely(bytes_to_write !=
                   fwrite(utf8_buffer, 1, (size_t) bytes_to_write, cOutFile))) {
        logError(printf("ut8Write: fwrite(*, 1, " FMT_U_MEM ", %d) failed:\n"
                        "errno=%d\nerror: %s\n",
                        bytes_to_write, safe_fileno(cOutFile),
                        errno, strerror(errno)););
        raise_error(FILE_ERROR);
        return;
      } /* if */
      source += chunk_len;
      remaining -= chunk_len;
    } /* while */
    logFunction(printf("ut8Write -->\n"););
  } /* ut8Write */
1059