1 /*
2     This file is part of GNU APL, a free implementation of the
3     ISO/IEC Standard 13751, "Programming Language APL, Extended"
4 
5     Copyright (C) 2008-2015  Dr. Jürgen Sauermann
6 
7     This program is free software: you can redistribute it and/or modify
8     it under the terms of the GNU General Public License as published by
9     the Free Software Foundation, either version 3 of the License, or
10     (at your option) any later version.
11 
12     This program is distributed in the hope that it will be useful,
13     but WITHOUT ANY WARRANTY; without even the implied warranty of
14     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15     GNU General Public License for more details.
16 
17     You should have received a copy of the GNU General Public License
18     along with this program.  If not, see <http://www.gnu.org/licenses/>.
19 */
20 
21 #ifndef __TOKENIZER_HH_DEFINED__
22 #define __TOKENIZER_HH_DEFINED__
23 
24 #include "Token.hh"
25 #include "UCS_string.hh"
26 
27 class Token;
28 
29 //-----------------------------------------------------------------------------
30 /// An iterator for UCS_string
31 class Unicode_source
32 {
33 public:
34    /// constructor: iterate over the entire string.
Unicode_source(const UCS_string & s)35    Unicode_source(const UCS_string & s)
36    : str(s),
37    idx(0),
38    end(s.size())
39    {}
40 
41    /// constructor: iterate from \b from to \b to
Unicode_source(const Unicode_source & src,int32_t from,int32_t to)42    Unicode_source(const Unicode_source & src, int32_t from, int32_t to)
43    : str(src.str),
44      idx(src.idx + from),
45      end(src.idx + from + to)
46    {
47      if (end > src.str.size())   end = src.str.size();
48      if (idx > end)   idx = end;
49    }
50 
51    /// return the number of remaining items
rest() const52    int32_t rest() const
53       { return end - idx; }
54 
55    /// lookup next item
operator [](int32_t i) const56    const Unicode & operator[](int32_t i) const
57       { i += idx;   Assert(uint32_t(i) < uint32_t(end));   return str[i]; }
58 
59    /// get next item
get()60    const Unicode & get()
61       { Assert(idx < end);   return str[idx++]; }
62 
63    /// lookup next item without removing it
operator *() const64    const Unicode & operator *() const
65       { Assert(idx < end);   return str[idx]; }
66 
67    /// skip the first element
operator ++()68    void operator ++()
69       { Assert(idx < end);   ++idx; }
70 
71    /// undo skip of the current element
operator --()72    void operator --()
73       { Assert(idx > 0);   --idx; }
74 
75    /// shrink the source to rest \b new_rest
set_rest(int32_t new_rest)76    void set_rest(int32_t new_rest)
77       { Assert(new_rest <= rest());   end = idx + new_rest; }
78 
79    /// skip \b count elements
skip(int32_t count)80    void skip(int32_t count)
81       { idx += count;   if (idx > end)   idx = end; }
82 
83 protected:
84    /// the source string
85    const UCS_string & str;
86 
87    /// the current position
88    int32_t idx;
89 
90    /// the end position (excluding)
91    int32_t end;
92 };
93 //-----------------------------------------------------------------------------
94 /// The converter from APL input characters to APL tokens
95 class Tokenizer
96 {
97 public:
98    /// Constructor
Tokenizer(ParseMode pm,const char * _loc,bool mac)99    Tokenizer(ParseMode pm, const char * _loc, bool mac)
100    : pmode(pm),
101      macro(mac),
102      loc(_loc),
103      rest_1(0),
104      rest_2(0)
105    {}
106 
107    /// tokenize UTF-8 string \b input into token string \b tos.
108    ErrorCode tokenize(const UCS_string & input, Token_string & tos);
109 
110    /// tokenize a primitive (1-character) function
111    static Token tokenize_function(Unicode uni);
112 
113 protected:
114    /// tokenize UCS string \b input into token string \b tos.
115    void do_tokenize(const UCS_string & input, Token_string & tos);
116 
117    /// tokenize a function
118    void tokenize_function(Unicode_source & src, Token_string & tos);
119 
120    /// tokenize a Quad function or variable
121    void tokenize_quad(Unicode_source & src, Token_string & tos);
122 
123    /// tokenize a single quoted string
124    void tokenize_string1(Unicode_source & src, Token_string & tos);
125 
126    /// tokenize a double quoted string
127    void tokenize_string2(Unicode_source & src, Token_string & tos);
128 
129    /// tokenize a number (integer, floating point, or complex).
130    void tokenize_number(Unicode_source & src, Token_string & tos);
131 
132    /// tokenize a real number (integer or floating point).
133    bool tokenize_real(Unicode_source &src, bool & need_float,
134                       APL_Float & flt_val, APL_Integer & int_val);
135 
136    /// a locale-independent sscanf()
137    static int scan_real(const char * strg, APL_Float & result,
138                         int E_pos, int minus_pos);
139 
140    /// tokenize a symbol
141    void tokenize_symbol(Unicode_source & src, Token_string & tos);
142 
143    /// the parsing mode of this parser
144    const ParseMode pmode;
145 
146    /// tokenize macro code
147    const bool macro;
148 
149    /// caller of this Tokenizer
150    const char * loc;
151 
152    /// the characters afer caret 1
153    int rest_1;
154 
155    /// the characters afer caret 2
156    int rest_2;
157 };
158 
159 #endif // __TOKENIZER_HH_DEFINED__
160