1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 1998-2012, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *
11 * File read.c
12 *
13 * Modification History:
14 *
15 *   Date        Name        Description
16 *   05/26/99    stephen     Creation.
17 *   5/10/01     Ram         removed ustdio dependency
18 *******************************************************************************
19 */
20 
21 #include "read.h"
22 #include "errmsg.h"
23 #include "toolutil.h"
24 #include "unicode/ustring.h"
25 #include "unicode/utf16.h"
26 
/* Code points the tokenizer treats specially, grouped by role. */

/* Structural delimiters: each one is returned as its own token. */
#define OPENBRACE    0x007B   /* left curly bracket  */
#define CLOSEBRACE   0x007D   /* right curly bracket */
#define COMMA        0x002C   /* comma               */
#define COLON        0x003A   /* colon               */

/* String quoting and escaping. */
#define QUOTE        0x0022   /* double quotation mark */
#define ESCAPE       0x005C   /* backslash             */

/* Comment introducers. */
#define SLASH        0x002F   /* solidus  */
#define ASTERISK     0x002A   /* asterisk */

/* Whitespace and line terminators. */
#define SPACE        0x0020
#define CR           0x000D
#define LF           0x000A

/* U+FFFE: getNextToken reports this character as TOK_ERROR
   (presumably a byte order mark read with the wrong endianness). */
#define BADBOM       0xFFFE

/* Line number the tokenizer is currently on.  Set to 1 by
   resetLineNumber() and advanced as line terminators are consumed. */
static int32_t lineCount;
41 
42 /* Protos */
43 static enum ETokenType getStringToken(UCHARBUF *buf,
44                                       UChar32 initialChar,
45                                       struct UString *token,
46                                       UErrorCode *status);
47 
48 static UChar32 getNextChar           (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status);
49 static void    seekUntilNewline      (UCHARBUF *buf, struct UString *token, UErrorCode *status);
50 static void    seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status);
51 static UBool   isWhitespace          (UChar32 c);
52 static UBool   isNewline             (UChar32 c);
53 
resetLineNumber()54 U_CFUNC void resetLineNumber() {
55     lineCount = 1;
56 }
57 
58 /* Read and return the next token from the stream.  If the token is of
59    type eString, fill in the token parameter with the token.  If the
60    token is eError, then the status parameter will contain the
61    specific error.  This will be eItemNotFound at the end of file,
62    indicating that all tokens have been returned.  This method will
63    never return eString twice in a row; instead, multiple adjacent
64    string tokens will be merged into one, with no intervening
65    space. */
66 U_CFUNC enum ETokenType
getNextToken(UCHARBUF * buf,struct UString * token,uint32_t * linenumber,struct UString * comment,UErrorCode * status)67 getNextToken(UCHARBUF* buf,
68              struct UString *token,
69              uint32_t *linenumber, /* out: linenumber of token */
70              struct UString *comment,
71              UErrorCode *status) {
72     enum ETokenType result;
73     UChar32         c;
74 
75     if (U_FAILURE(*status)) {
76         return TOK_ERROR;
77     }
78 
79     /* Skip whitespace */
80     c = getNextChar(buf, TRUE, comment, status);
81 
82     if (U_FAILURE(*status)) {
83         return TOK_ERROR;
84     }
85 
86     *linenumber = lineCount;
87 
88     switch(c) {
89     case BADBOM:
90         return TOK_ERROR;
91     case OPENBRACE:
92         return TOK_OPEN_BRACE;
93     case CLOSEBRACE:
94         return TOK_CLOSE_BRACE;
95     case COMMA:
96         return TOK_COMMA;
97     case U_EOF:
98         return TOK_EOF;
99     case COLON:
100         return TOK_COLON;
101 
102     default:
103         result = getStringToken(buf, c, token, status);
104     }
105 
106     *linenumber = lineCount;
107     return result;
108 }
109 
110 /* Copy a string token into the given UnicodeString.  Upon entry, we
111    have already read the first character of the string token, which is
112    not a whitespace character (but may be a QUOTE or ESCAPE). This
113    function reads all subsequent characters that belong with this
114    string, and copy them into the token parameter. The other
115    important, and slightly convoluted purpose of this function is to
116    merge adjacent strings.  It looks forward a bit, and if the next
117    non comment, non whitespace item is a string, it reads it in as
118    well.  If two adjacent strings are quoted, they are merged without
119    intervening space.  Otherwise a single SPACE character is
120    inserted. */
getStringToken(UCHARBUF * buf,UChar32 initialChar,struct UString * token,UErrorCode * status)121 static enum ETokenType getStringToken(UCHARBUF* buf,
122                                       UChar32 initialChar,
123                                       struct UString *token,
124                                       UErrorCode *status) {
125     UBool    lastStringWasQuoted;
126     UChar32  c;
127     UChar    target[3] = { '\0' };
128     UChar    *pTarget   = target;
129     int      len=0;
130     UBool    isFollowingCharEscaped=FALSE;
131     UBool    isNLUnescaped = FALSE;
132     UChar32  prevC=0;
133 
134     /* We are guaranteed on entry that initialChar is not a whitespace
135        character. If we are at the EOF, or have some other problem, it
136        doesn't matter; we still want to validly return the initialChar
137        (if nothing else) as a string token. */
138 
139     if (U_FAILURE(*status)) {
140         return TOK_ERROR;
141     }
142 
143     /* setup */
144     lastStringWasQuoted = FALSE;
145     c = initialChar;
146     ustr_setlen(token, 0, status);
147 
148     if (U_FAILURE(*status)) {
149         return TOK_ERROR;
150     }
151 
152     for (;;) {
153         if (c == QUOTE) {
154             if (!lastStringWasQuoted && token->fLength > 0) {
155                 ustr_ucat(token, SPACE, status);
156 
157                 if (U_FAILURE(*status)) {
158                     return TOK_ERROR;
159                 }
160             }
161 
162             lastStringWasQuoted = TRUE;
163 
164             for (;;) {
165                 c = ucbuf_getc(buf,status);
166 
167                 /* EOF reached */
168                 if (c == U_EOF) {
169                     return TOK_EOF;
170                 }
171 
172                 /* Unterminated quoted strings */
173                 if (U_FAILURE(*status)) {
174                     return TOK_ERROR;
175                 }
176 
177                 if (c == QUOTE && !isFollowingCharEscaped) {
178                     break;
179                 }
180 
181                 if (c == ESCAPE  && !isFollowingCharEscaped) {
182                     pTarget = target;
183                     c       = unescape(buf, status);
184 
185                     if (c == U_ERR) {
186                         return TOK_ERROR;
187                     }
188                     if(c == CR || c == LF){
189                         isNLUnescaped = TRUE;
190                     }
191                 }
192 
193                 if(c==ESCAPE && !isFollowingCharEscaped){
194                     isFollowingCharEscaped = TRUE;
195                 }else{
196                     U_APPEND_CHAR32(c, pTarget,len);
197                     pTarget = target;
198                     ustr_uscat(token, pTarget,len, status);
199                     isFollowingCharEscaped = FALSE;
200                     len=0;
201                     if(c == CR || c == LF){
202                         if(isNLUnescaped == FALSE && prevC!=CR){
203                             lineCount++;
204                         }
205                         isNLUnescaped = FALSE;
206                     }
207                 }
208 
209                 if (U_FAILURE(*status)) {
210                     return TOK_ERROR;
211                 }
212                 prevC = c;
213             }
214         } else {
215             if (token->fLength > 0) {
216                 ustr_ucat(token, SPACE, status);
217 
218                 if (U_FAILURE(*status)) {
219                     return TOK_ERROR;
220                 }
221             }
222 
223             if(lastStringWasQuoted){
224                 if(getShowWarning()){
225                     warning(lineCount, "Mixing quoted and unquoted strings");
226                 }
227                 if(isStrict()){
228                     return TOK_ERROR;
229                 }
230 
231             }
232 
233             lastStringWasQuoted = FALSE;
234 
235             /* if we reach here we are mixing
236              * quoted and unquoted strings
237              * warn in normal mode and error in
238              * pedantic mode
239              */
240 
241             if (c == ESCAPE) {
242                 pTarget = target;
243                 c       = unescape(buf, status);
244 
245                 /* EOF reached */
246                 if (c == U_EOF) {
247                     return TOK_ERROR;
248                 }
249             }
250 
251             U_APPEND_CHAR32(c, pTarget,len);
252             pTarget = target;
253             ustr_uscat(token, pTarget,len, status);
254             len=0;
255 
256             if (U_FAILURE(*status)) {
257                 return TOK_ERROR;
258             }
259 
260             for (;;) {
261                 /* DON'T skip whitespace */
262                 c = getNextChar(buf, FALSE, NULL, status);
263 
264                 /* EOF reached */
265                 if (c == U_EOF) {
266                     ucbuf_ungetc(c, buf);
267                     return TOK_STRING;
268                 }
269 
270                 if (U_FAILURE(*status)) {
271                     return TOK_STRING;
272                 }
273 
274                 if (c == QUOTE
275                         || c == OPENBRACE
276                         || c == CLOSEBRACE
277                         || c == COMMA
278                         || c == COLON) {
279                     ucbuf_ungetc(c, buf);
280                     break;
281                 }
282 
283                 if (isWhitespace(c)) {
284                     break;
285                 }
286 
287                 if (c == ESCAPE) {
288                     pTarget = target;
289                     c       = unescape(buf, status);
290 
291                     if (c == U_ERR) {
292                         return TOK_ERROR;
293                     }
294                 }
295 
296                 U_APPEND_CHAR32(c, pTarget,len);
297                 pTarget = target;
298                 ustr_uscat(token, pTarget,len, status);
299                 len=0;
300                 if (U_FAILURE(*status)) {
301                     return TOK_ERROR;
302                 }
303             }
304         }
305 
306         /* DO skip whitespace */
307         c = getNextChar(buf, TRUE, NULL, status);
308 
309         if (U_FAILURE(*status)) {
310             return TOK_STRING;
311         }
312 
313         if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) {
314             ucbuf_ungetc(c, buf);
315             return TOK_STRING;
316         }
317     }
318 }
319 
320 /* Retrieve the next character.  If skipwhite is
321    true, whitespace is skipped as well. */
getNextChar(UCHARBUF * buf,UBool skipwhite,struct UString * token,UErrorCode * status)322 static UChar32 getNextChar(UCHARBUF* buf,
323                            UBool skipwhite,
324                            struct UString *token,
325                            UErrorCode *status) {
326     UChar32 c, c2;
327 
328     if (U_FAILURE(*status)) {
329         return U_EOF;
330     }
331 
332     for (;;) {
333         c = ucbuf_getc(buf,status);
334 
335         if (c == U_EOF) {
336             return U_EOF;
337         }
338 
339         if (skipwhite && isWhitespace(c)) {
340             continue;
341         }
342 
343         /* This also handles the get() failing case */
344         if (c != SLASH) {
345             return c;
346         }
347 
348         c = ucbuf_getc(buf,status); /* "/c" */
349 
350         if (c == U_EOF) {
351             return U_EOF;
352         }
353 
354         switch (c) {
355         case SLASH:  /* "//" */
356             seekUntilNewline(buf, NULL, status);
357             break;
358 
359         case ASTERISK:  /* " / * " */
360             c2 = ucbuf_getc(buf, status); /* "/ * c" */
361             if(c2 == ASTERISK){  /* "/ * *" */
362                 /* parse multi-line comment and store it in token*/
363                 seekUntilEndOfComment(buf, token, status);
364             } else {
365                 ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *".  Include c2  back in buffer.  */
366                 seekUntilEndOfComment(buf, NULL, status);
367             }
368             break;
369 
370         default:
371             ucbuf_ungetc(c, buf); /* "/c" - put back the c */
372             /* If get() failed this is a NOP */
373             return SLASH;
374         }
375 
376     }
377 }
378 
/* Consume the rest of the current line, including its terminator.

   buf    - input buffer to read from
   token  - if not NULL, every character consumed (the line terminator
            included) is appended to it
   status - in/out ICU error code; nothing is read if it is already a
            failure

   Reading stops after the first character for which isNewline() is
   true, at end of input, or when *status becomes a failure.  isNewline
   advances lineCount as a side effect.  The end-of-input sentinel is
   not appended to token, and a warning value in *status does not stop
   the scan. */
static void seekUntilNewline(UCHARBUF* buf,
                             struct UString *token,
                             UErrorCode *status) {
    UChar32 c;

    if (U_FAILURE(*status)) {
        return;
    }

    for (;;) {
        c = ucbuf_getc(buf,status);

        /* U_EOF is a sentinel, not text: stop without storing it */
        if (c == U_EOF) {
            break;
        }

        /* add the char to token */
        if(token!=NULL){
            ustr_u32cat(token, c, status);
        }

        if (isNewline(c) || U_FAILURE(*status)) {
            break;
        }
    }
}
396 
/* Consume a block comment whose opening has already been read, up to
   and including the closing ASTERISK SLASH pair.

   buf    - input buffer to read from
   token  - if not NULL, the comment text (without the closing pair) is
            appended to it
   status - in/out ICU error code; nothing is read if it is already a
            failure

   If the input ends before the comment is closed, *status is set to
   U_INVALID_FORMAT_ERROR and an error naming the line on which the
   scan started is reported.  The end-of-input sentinel is not appended
   to token, and a warning value in *status does not stop the scan. */
static void seekUntilEndOfComment(UCHARBUF *buf,
                                  struct UString *token,
                                  UErrorCode *status) {
    UChar32  c, d;
    uint32_t line;

    if (U_FAILURE(*status)) {
        return;
    }

    /* remember where the scan started, for the diagnostic below */
    line = lineCount;

    for (;;) {
        c = ucbuf_getc(buf, status);

        /* U_EOF is a sentinel, not text: stop without storing it */
        if (c == U_EOF) {
            break;
        }

        if (c == ASTERISK) {
            d = ucbuf_getc(buf, status);

            if (d == SLASH) {
                /* closing pair found; c stays ASTERISK */
                break;
            }
            /* lone asterisk: give the following character back */
            ucbuf_ungetc(d, buf);
        }
        /* add the char to token */
        if(token!=NULL){
            ustr_u32cat(token, c, status);
        }
        /* called only for its side effect: increment the lineCount */
        isNewline(c);

        if (U_FAILURE(*status)) {
            break;
        }
    }

    if (c == U_EOF) {
        *status = U_INVALID_FORMAT_ERROR;
        error(line, "unterminated comment detected");
    }
}
435 
unescape(UCHARBUF * buf,UErrorCode * status)436 U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) {
437     if (U_FAILURE(*status)) {
438         return U_EOF;
439     }
440 
441     /* We expect to be called after the ESCAPE has been seen, but
442      * u_fgetcx needs an ESCAPE to do its magic. */
443     ucbuf_ungetc(ESCAPE, buf);
444 
445     return ucbuf_getcx32(buf, status);
446 }
447 
isWhitespace(UChar32 c)448 static UBool isWhitespace(UChar32 c) {
449     switch (c) {
450         /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */
451     case 0x000A:
452     case 0x2029:
453         lineCount++;
454     case 0x000D:
455     case 0x0020:
456     case 0x0009:
457     case 0xFEFF:
458         return TRUE;
459 
460     default:
461         return FALSE;
462     }
463 }
464 
isNewline(UChar32 c)465 static UBool isNewline(UChar32 c) {
466     switch (c) {
467         /* '\n', '\r', 0x2029 */
468     case 0x000A:
469     case 0x2029:
470         lineCount++;
471     case 0x000D:
472         return TRUE;
473 
474     default:
475         return FALSE;
476     }
477 }
478