/*
    This file is part of GNU APL, a free implementation of the
    ISO/IEC Standard 13751, "Programming Language APL, Extended"

    Copyright (C) 2008-2015  Dr. Jürgen Sauermann

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#include <string.h>

#include "Bif_F12_FORMAT.hh"
#include "Bif_F12_SORT.hh"
#include "Bif_F12_TAKE_DROP.hh"
#include "Bif_OPER1_COMMUTE.hh"
#include "Bif_OPER1_EACH.hh"
#include "Bif_OPER2_INNER.hh"
#include "Bif_OPER2_OUTER.hh"
#include "Bif_OPER2_POWER.hh"
#include "Bif_OPER2_RANK.hh"
#include "Bif_OPER1_REDUCE.hh"
#include "Bif_OPER1_SCAN.hh"
#include "CharCell.hh"
#include "Common.hh"
#include "ComplexCell.hh"
#include "FloatCell.hh"
#include "IntCell.hh"
#include "Output.hh"
#include "PointerCell.hh"
#include "Symbol.hh"
#include "SystemLimits.hh"
#include "SystemVariable.hh"
#include "Tokenizer.hh"
#include "Value.hh"
#include "Workspace.hh"

//-----------------------------------------------------------------------------
inline ostream & operator << (ostream & out, const Unicode_source & src)
   { loop(s, src.rest())   out << src[s];   return out; }
//-----------------------------------------------------------------------------
/** convert \b UCS_string input into a Token_string tos, returning an
    ErrorCode (E_NO_ERROR on success) instead of throwing.
*/
ErrorCode
Tokenizer::tokenize(const UCS_string & input, Token_string & tos)
{
   try
      {
        do_tokenize(input, tos);
      }
   catch (Error err)
      {
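        // rest_1 was set at the start of the offending token and rest_2 at
        // the character where the error was detected (see do_tokenize()).
        // Converting them into offsets from the start of the input yields
        // the two caret positions passed to set_error_line_2().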
        const int caret_1 = input.size() - rest_1;
        const int caret_2 = input.size() - rest_2;
        err.set_error_line_2(input, caret_1, caret_2);
        return err.get_error_code();
      }

   return E_NO_ERROR;
}
//-----------------------------------------------------------------------------
/** convert \b UCS_string input into a Token_string tos. Errors are thrown
    as \b Error and caught by tokenize() above.
*/
void
Tokenizer::do_tokenize(const UCS_string & input, Token_string & tos)
{
   Log(LOG_tokenize)
      CERR << "tokenize: input[" << input.size() << "] is: «"
           << input << "»" << endl;

Unicode_source src(input);
   while ((rest_1 = rest_2 = src.rest()) != 0)
      {
        Unicode uni = *src;
        if (uni == UNI_COMMENT)             break;   // ⍝ comment
        if (uni == UNI_ASCII_NUMBER_SIGN)   break;   // # comment

        const Token tok = Avec::uni_to_token(uni, LOC);

        Log(LOG_tokenize)
           {
             Unicode_source s1(src, 0, 24);
             CERR << "  tokenize(" <<  src.rest() << " chars) sees [tag "
                  << tok.tag_name() << " «" << uni << "»] " << s1;
             if (src.rest() != s1.rest())   CERR << " ...";
             CERR << endl;
           }

        switch(tok.get_Class())
            {
              case TC_END:   // chars without APL meaning
                   rest_2 = src.rest();
                   {
                     Log(LOG_error_throw)
                     CERR << endl << "throwing "
                          << Error::error_name(E_NO_TOKEN)
                          << " in  Tokenizer" << endl;

                     char cc[20];
                     snprintf(cc, sizeof(cc), "U+%4.4X (", uni);
                     MORE_ERROR() << "Tokenizer: No token for Unicode "
                                  <<  cc << uni << ")\nInput: " << input;
                     Error error(E_NO_TOKEN, LOC);
                     throw error;
                   }
                   break;

              case TC_RETURN:
              case TC_LINE:
              case TC_VALUE:
              case TC_INDEX:
                   CERR << "Offending token: " << tok.get_tag()
                        << " (" << tok << ")" << endl;
                   if (tok.get_tag() == TOK_CHARACTER)
                      CERR << "Unicode: " << UNI(tok.get_char_val()) << endl;
                   rest_2 = src.rest();
                   Error::throw_parse_error(E_NON_APL_CHAR, LOC, loc);
                   break;

              case TC_VOID:
                   // Avec::uni_to_token returns TC_VOID for non-apl characters
                   //
                   rest_2 = src.rest();
                   UERR << "Unknown APL character: " << uni
                        << " (" << UNI(uni) << ")" << endl;
                   Error::throw_parse_error(E_NON_APL_CHAR, LOC, loc);
                   break;

              case TC_SYMBOL:
                   if (Avec::is_quad(uni))
                      {
                         tokenize_quad(src, tos);
                      }
                   else if (uni == UNI_QUOTE_Quad)
                      {
                        ++src;
                        tos.push_back(Token(TOK_Quad_QUOTE,
                                         &Workspace::get_v_Quad_QUOTE()));
                      }
                   else if (uni == UNI_ALPHA)
                      {
                        ++src;
                        tos.push_back(Token(TOK_ALPHA,
                                         &Workspace::get_v_ALPHA()));
                      }
                   else if (uni == UNI_ALPHA_UNDERBAR)
                      {
                        ++src;
                        tos.push_back(Token(TOK_ALPHA_U,
                                            &Workspace::get_v_ALPHA_U()));
                      }
                   else if (uni == UNI_CHI)
                      {
                        ++src;
                        tos.push_back(Token(TOK_CHI,
                                            &Workspace::get_v_CHI()));
                      }
                   else if (uni == UNI_LAMBDA)
                      {
                        // this could be λ like in λ← ...
                        // or λ1 or λ2 or ... as in ... ⍺ λ1 ⍵
                        //
                        if (src.rest() > 1 && Avec::is_digit(src[1]))   // λn
                           {
                             tokenize_symbol(src, tos);
                           }
                        else   // λ
                           {
                             ++src;
                             tos.push_back(Token(TOK_LAMBDA,
                                           &Workspace::get_v_LAMBDA()));
                           }
                      }
                   else if (uni == UNI_OMEGA)
                      {
                        ++src;
                        tos.push_back(Token(TOK_OMEGA,
                                            &Workspace::get_v_OMEGA()));
                      }
                   else if (uni == UNI_OMEGA_UNDERBAR)
                      {
                        ++src;
                        tos.push_back(Token(TOK_OMEGA_U,
                                            &Workspace::get_v_OMEGA_U()));
                      }
                   else
                      {
                        tokenize_symbol(src, tos);
                      }
                   break;

              case TC_FUN0:
              case TC_FUN12:
              case TC_OPER1:
                   tokenize_function(src, tos);
                   break;

              case TC_OPER2:
                   if (tok.get_tag() == TOK_OPER2_INNER && src.rest())
                      {
                        // tok is a dot. This could mean that . is either
                        //
                        // the start of a number:     e.g. +.3
                        // or an operator:            e.g. +.*
                        // or a syntax error:         e.g. Done.
                        //
                        if (src.rest() == 1)   // syntax error
                           Error::throw_parse_error(E_SYNTAX_ERROR, LOC, loc);

                        Unicode uni_1 = src[1];
                        const Token tok_1 = Avec::uni_to_token(uni_1, LOC);
                        if ((tok_1.get_tag() & TC_MASK) == TC_NUMERIC)
                           tokenize_number(src, tos);
                        else
                           tokenize_function(src, tos);
                      }
                   else
                      {
                        tokenize_function(src, tos);
                      }
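                   // if the last two tokens are ∘ (jot) followed by the
                   // inner product dot, then ∘. is the start of an outer
                   // product: replace the dot with the outer product
                   // operator in place.
                   //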
                   if (tos.size() >= 2 &&
                       tos.back().get_tag() == TOK_OPER2_INNER &&
                       tos[tos.size() - 2].get_tag() == TOK_JOT)
                      {
                        new (&tos.back()) Token(TOK_OPER2_OUTER,
                                                Bif_OPER2_OUTER::fun);
                      }

                   break;

              case TC_R_ARROW:
                   ++src;
                   if (src.rest())   tos.push_back(tok);
                   else              tos.push_back(Token(TOK_ESCAPE));
                   break;

              case TC_ASSIGN:
                   ++src;
                   {
                     const bool sym = tos.size() >= 1 &&
                                tos[tos.size() - 1].get_tag() == TOK_SYMBOL;
                     const bool dia = tos.size() > 1 &&
                                tos[tos.size() - 2].get_tag() == TOK_DIAMOND;
                     const bool col = tos.size() > 1 &&
                                tos[tos.size() - 2].get_tag() == TOK_COLON;

                   tos.push_back(tok);

                   // change token tag of ← if:
                   //
                   //   SYM ←   (at the start of line),        or
                   // ◊ SYM ←   (at the start of statement),   or
                   // : SYM ←   (at the start of statement after label)
                   //
                   if (sym && ((tos.size() == 2) || dia || col))
                      tos.back().ChangeTag(TOK_ASSIGN1);
                   }
                   break;

              case TC_L_PARENT:
              case TC_R_PARENT:
              case TC_L_BRACK:
              case TC_R_BRACK:
              case TC_L_CURLY:
              case TC_R_CURLY:
                   ++src;
                   tos.push_back(tok);
                   break;

              case TC_DIAMOND:
                   ++src;
                   tos.push_back(tok);
                   break;

              case TC_COLON:
                   if (pmode != PM_FUNCTION)
                      {
                        rest_2 = src.rest();
                        if (pmode == PM_EXECUTE)
                           Error::throw_parse_error(E_ILLEGAL_COLON_EXEC,
                                                    LOC, loc);
                        else
                           Error::throw_parse_error(E_ILLEGAL_COLON_STAT,
                                                    LOC, loc);
                      }

                   ++src;
                   tos.push_back(tok);
                   break;

              case TC_NUMERIC:
                   tokenize_number(src, tos);
                   break;

              case TC_SPACE:
              case TC_NEWLINE:
                   ++src;
                   break;

              case TC_QUOTE:
                   if (tok.get_tag() == TOK_QUOTE1)
                      tokenize_string1(src, tos);
                   else
                      tokenize_string2(src, tos);
                   break;

              default:
                   CERR << "Input: " << input << endl
                        << "uni:   " << uni << endl
                        << "Token = " << tok.get_tag() << endl;

                   if (tok.get_Id() != ID_No_ID)
                      {
                        CERR << ", Id = " << Id(tok.get_tag() >> 16);
                      }
                   CERR << endl;
                   Assert(0 && "Should not happen");
            }
      }

   Log(LOG_tokenize)
      CERR << "tokenize() done (no error)" << endl;
}
//-----------------------------------------------------------------------------
void
Tokenizer::tokenize_function(Unicode_source & src, Token_string & tos)
{
   Log(LOG_tokenize)   CERR << "tokenize_function(" << src << ")" << endl;

const Unicode uni = src.get();
const Token tok = tokenize_function(uni);
   tos.push_back(tok);
}
//-----------------------------------------------------------------------------
Token
Tokenizer::tokenize_function(Unicode uni)
{
const Token tok = Avec::uni_to_token(uni, LOC);

#define sys(t, f) \
   case TOK_ ## t: return Token(tok.get_tag(), Bif_ ## f::fun);   break;

   switch(tok.get_tag())
      {
        sys(F0_ZILDE,      F0_ZILDE)
        sys(F1_EXECUTE,    F1_EXECUTE)

        sys(F2_AND,        F2_AND)
        sys(F2_EQUAL,      F2_EQUAL)
        sys(F2_FIND,       F2_FIND)
        sys(F2_GREATER,    F2_GREATER)
        sys(F2_INDEX,      F2_INDEX)
        sys(F2_LESS,       F2_LESS)
        sys(F2_LEQ,        F2_LEQ)
        sys(F2_MEQ,        F2_MEQ)
        sys(F2_NAND,       F2_NAND)
        sys(F2_NOR,        F2_NOR)
        sys(F2_OR,         F2_OR)
        sys(F2_UNEQ,       F2_UNEQ)

        sys(F12_BINOM,     F12_BINOM)
        sys(F12_CIRCLE,    F12_CIRCLE)
        sys(F12_COMMA,     F12_COMMA)
        sys(F12_COMMA1,    F12_COMMA1)
        sys(F12_DECODE,    F12_DECODE)
        sys(F12_DIVIDE,    F12_DIVIDE)
        sys(F12_DOMINO,    F12_DOMINO)
        sys(F12_DROP,      F12_DROP)
        sys(F12_ELEMENT,   F12_ELEMENT)
        sys(F12_ENCODE,    F12_ENCODE)
        sys(F12_EQUIV,     F12_EQUIV)
        sys(F12_FORMAT,    F12_FORMAT)
        sys(F12_INDEX_OF,  F12_INDEX_OF)
        sys(F2_INTER,      F2_INTER)
        sys(F2_LEFT,       F2_LEFT)
        sys(F12_LOGA,      F12_LOGA)
        sys(F12_MINUS,     F12_MINUS)
        sys(F12_NEQUIV,    F12_NEQUIV)
        sys(F12_PARTITION, F12_PARTITION)
        sys(F12_PICK,      F12_PICK)
        sys(F12_PLUS,      F12_PLUS)
        sys(F12_POWER,     F12_POWER)
        sys(F12_RHO,       F12_RHO)
        sys(F2_RIGHT,      F2_RIGHT)
        sys(F12_RND_DN,    F12_RND_DN)
        sys(F12_RND_UP,    F12_RND_UP)
        sys(F12_ROLL,      F12_ROLL)
        sys(F12_ROTATE,    F12_ROTATE)
        sys(F12_ROTATE1,   F12_ROTATE1)
        sys(F12_SORT_ASC,  F12_SORT_ASC)
        sys(F12_SORT_DES,  F12_SORT_DES)
        sys(F12_STILE,     F12_STILE)
        sys(F12_TAKE,      F12_TAKE)
        sys(F12_TRANSPOSE, F12_TRANSPOSE)
        sys(F12_TIMES,     F12_TIMES)
        sys(F12_UNION,     F12_UNION)
        sys(F12_WITHOUT,   F12_WITHOUT)

        sys(JOT,           JOT)

        sys(OPER1_COMMUTE, OPER1_COMMUTE)
        sys(OPER1_EACH,    OPER1_EACH)
        sys(OPER2_POWER,   OPER2_POWER)
        sys(OPER2_RANK,    OPER2_RANK)
        sys(OPER1_REDUCE,  OPER1_REDUCE)
        sys(OPER1_REDUCE1, OPER1_REDUCE1)
        sys(OPER1_SCAN,    OPER1_SCAN)
        sys(OPER1_SCAN1,   OPER1_SCAN1)

        sys(OPER2_INNER,   OPER2_INNER)

        default: break;
      }

   // CAUTION: cannot print entire token here because Avec::uni_to_token()
   // inits the token tag but not any token pointers!
   //
   CERR << endl << "Token = " << tok.get_tag() << endl;
   Assert(0 && "Missing Function");

#undef sys
   return tok;
}
//-----------------------------------------------------------------------------
void
Tokenizer::tokenize_quad(Unicode_source & src, Token_string & tos)
{
   Log(LOG_tokenize)
      CERR << "tokenize_quad(" << src.rest() << " chars)"<< endl;

   src.get();               // discard (possibly alternative) ⎕
UCS_string ucs(UNI_Quad_Quad);
   Assert(ucs[0]);

   if (src.rest() > 0)   ucs.append(src[0]);
   if (src.rest() > 1)   ucs.append(src[1]);
   if (src.rest() > 2)   ucs.append(src[2]);
   if (src.rest() > 3)   ucs.append(src[3]);
   if (src.rest() > 4)   ucs.append(src[4]);

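// Workspace::get_quad() determines the longest ⎕xxx name matching the start
// of ucs and returns (in len) how many characters of ucs it used. The ⎕
// itself was already consumed by src.get() above, so only len - 1 further
// characters need to be skipped below.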
int len = 0;
const Token t = Workspace::get_quad(ucs, len);
   src.skip(len - 1);
   tos.push_back(t);
}
//-----------------------------------------------------------------------------
/** tokenize a single quoted string.
 ** If the string is a single character, then we
 **  return a TOK_CHARACTER. Otherwise we return TOK_APL_VALUE1.
 **/
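//
// For example:  'A'      becomes TOK_CHARACTER A
//               'it''s'  becomes a TOK_APL_VALUE1 holding it's
//               (a doubled '' inside the string denotes a single ')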
void
Tokenizer::tokenize_string1(Unicode_source & src, Token_string & tos)
{
   Log(LOG_tokenize)   CERR << "tokenize_string1(" << src << ")" << endl;

const Unicode uni = src.get();
   Assert(Avec::is_single_quote(uni));

UCS_string string_value;
bool got_end = false;

   while (src.rest())
       {
         const Unicode uni = src.get();

         if (Avec::is_single_quote(uni))
            {
              // a single ' is the end of the string, while a double '
              // (i.e. '') is a single '.
              //
              if ((src.rest() == 0) || !Avec::is_single_quote(*src))
                 {
                   got_end = true;
                   break;
                 }

              string_value.append(UNI_SINGLE_QUOTE);
              ++src;      // skip the second '
            }
         else if (uni == UNI_ASCII_CR)
            {
              continue;
            }
         else if (uni == UNI_ASCII_LF)
            {
              rest_2 = src.rest();
              Error::throw_parse_error(E_NO_STRING_END, LOC, loc);
            }
         else
            {
              string_value.append(uni);
            }
       }

   if (!got_end)   Error::throw_parse_error(E_NO_STRING_END, LOC, loc);

   if (string_value.size() == 1)   // scalar
      {
        tos.push_back(Token(TOK_CHARACTER, string_value[0]));
      }
   else
      {
        tos.push_back(Token(TOK_APL_VALUE1, Value_P(string_value, LOC)));
      }
}
//-----------------------------------------------------------------------------
/** tokenize a double quoted string.
 ** Backslash escape sequences (\n, \t, \", \\, ...) are expanded, and the
 ** result is always returned as a TOK_APL_VALUE1 (even if the string is a
 ** single character).
 **/
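//
// For example:  "A\tB\"C"  becomes a TOK_APL_VALUE1 holding  A TAB B " C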
void
Tokenizer::tokenize_string2(Unicode_source & src, Token_string & tos)
{
   Log(LOG_tokenize)   CERR << "tokenize_string2(" << src << ")" << endl;

   // skip the leading "
   {
     const Unicode uni = src.get();
     if (uni)   { /* do nothing, needed for -Wall */ }

     Assert1(uni == UNI_ASCII_DOUBLE_QUOTE);
   }

UCS_string string_value;
bool got_end = false;

   while (src.rest())
       {
         const Unicode uni = src.get();

         if (uni == UNI_ASCII_DOUBLE_QUOTE)     // terminating "
            {
              got_end = true;
              break;
            }
         else if (uni == UNI_ASCII_CR)          // ignore CR
            {
              continue;
            }
         else if (uni == UNI_ASCII_LF)          // end of line before "
            {
              rest_2 = src.rest();
              Error::throw_parse_error(E_NO_STRING_END, LOC, loc);
            }
         else if (uni == UNI_ASCII_BACKSLASH)   // backslash
            {
              const Unicode uni1 = src.get();
              switch(uni1)
                 {
                   case '0':  string_value.append(UNI_ASCII_NUL);         break;
                   case 'a':  string_value.append(UNI_ASCII_BEL);         break;
                   case 'b':  string_value.append(UNI_ASCII_BS);          break;
                   case 't':  string_value.append(UNI_ASCII_HT);          break;
                   case 'n':  string_value.append(UNI_ASCII_LF);          break;
                   case 'v':  string_value.append(UNI_ASCII_VT);          break;
                   case 'f':  string_value.append(UNI_ASCII_FF);          break;
                   case 'r':  string_value.append(UNI_ASCII_CR);          break;
                   case '[':  string_value.append(UNI_ASCII_ESC);         break;
                   case '"':  string_value.append(UNI_ASCII_DOUBLE_QUOTE);break;
                   case '\\': string_value.append(UNI_ASCII_BACKSLASH);   break;
                   default:   string_value.append(uni);
                              string_value.append(uni1);
                 }
            }
         else
            {
              string_value.append(uni);
            }
       }

   if (!got_end)   Error::throw_parse_error(E_NO_STRING_END, LOC, loc);

   else
      {
        tos.push_back(Token(TOK_APL_VALUE1, Value_P(string_value, LOC)));
      }
}
//-----------------------------------------------------------------------------
void
Tokenizer::tokenize_number(Unicode_source & src, Token_string & tos)
{
   Log(LOG_tokenize)   CERR << "tokenize_number(" << src << ")" << endl;

   // numbers:
   // real
   // real 'J' real
   // real 'D' real   // magnitude + angle in degrees
   // real 'R' real   // magnitude + angle in radian
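   //
   // for example (the D and R forms are converted to a complex number below):
   //
   //   3J4     →   3 + 4i
   //   2D90    →   2×cos 90°  + (2×sin 90°)i    (≈ 0J2)
   //   1R3.14  →   cos 3.14   + (sin 3.14)i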

APL_Float   real_flt = 0.0;   // always valid
APL_Integer real_int = 0;     // valid if need_float is false
bool        real_need_float = false;
const bool real_valid = tokenize_real(src, real_need_float, real_flt, real_int);
   if (!real_need_float)   real_flt = real_int;
   if (!real_valid)
      {
        rest_2 = src.rest();
        Error::throw_parse_error(E_BAD_NUMBER, LOC, loc);
      }

   if (src.rest() && (*src == UNI_ASCII_J || *src == UNI_ASCII_j))
      {
        ++src;   // skip 'J'

        APL_Float   imag_flt = 0.0;   // always valid
        APL_Integer imag_int = 0;     // valid if imag_need_float is false
        bool imag_need_float = false;
        const bool imag_valid = tokenize_real(src, imag_need_float,
                                              imag_flt, imag_int);
        if (!imag_need_float)   imag_flt = imag_int;

        if (!imag_valid)
           {
             --src;   // undo skip 'J'
             if (real_need_float)
                {
                  tos.push_back(Token(TOK_REAL,    real_flt));
                  Log(LOG_tokenize)
                     CERR << "  tokenize_number: real " << real_flt << endl;
                }
             else
                {
                  tos.push_back(Token(TOK_INTEGER, real_int));
                  Log(LOG_tokenize)
                     CERR << "  tokenize_number: integer " << real_int << endl;
                }
             goto done;
           }

        tos.push_back(Token(TOK_COMPLEX, real_flt, imag_flt));
        Log(LOG_tokenize)   CERR << "  tokenize_number: complex "
                                 << real_flt << "J" << imag_flt << endl;
      }
   else if (src.rest() && (*src == UNI_ASCII_D || *src == UNI_ASCII_d))
      {
        ++src;   // skip 'D'

        APL_Float   degrees_flt = 0;  // always valid
        APL_Integer degrees_int = 0;  // valid if imag_need_float is false
        bool imag_need_float = false;
        const bool imag_valid = tokenize_real(src, imag_need_float,
                                              degrees_flt, degrees_int);
        if (!imag_need_float)   degrees_flt = degrees_int;

        if (!imag_valid)
           {
             --src;   // undo skip 'D'
             if (real_need_float)
                {
                  tos.push_back(Token(TOK_REAL,    real_flt));
                  Log(LOG_tokenize)
                     CERR << "  tokenize_number: real " << real_flt << endl;
                }
             else
                {
                  tos.push_back(Token(TOK_INTEGER, real_int));
                  Log(LOG_tokenize)
                     CERR << "  tokenize_number: integer " << real_int << endl;
                }
             goto done;
           }

        // real_flt is the magnitude and the angle is in degrees.
        //
        APL_Float real = real_flt * cos(M_PI*degrees_flt / 180.0);
        APL_Float imag = real_flt * sin(M_PI*degrees_flt / 180.0);
        tos.push_back(Token(TOK_COMPLEX, real, imag));
        Log(LOG_tokenize)   CERR << "  tokenize_number: complex " << real
                                 << "J" << imag << endl;
      }
   else if (src.rest() && (*src == UNI_ASCII_R || *src == UNI_ASCII_r))
      {
        ++src;   // skip 'R'

        APL_Float   radian_flt = 0.0;   // always valid
        APL_Integer radian_int = 0;     // valid if radian_need_float is false
        bool radian_need_float = false;
        const bool imag_valid = tokenize_real(src, radian_need_float,
                                              radian_flt, radian_int);
        if (!radian_need_float)   radian_flt = radian_int;

        if (!imag_valid)
           {
             --src;   // undo skip 'R'
             if (real_need_float)
                {
                  tos.push_back(Token(TOK_REAL,    real_flt));
                  Log(LOG_tokenize)
                     CERR << "  tokenize_number: real " << real_flt << endl;
                }
             else
                {
                  tos.push_back(Token(TOK_INTEGER, real_int));
                  Log(LOG_tokenize)
                     CERR << "  tokenize_number: integer " << real_int << endl;
                }
             goto done;
           }

        // real_flt is the magnitude and the angle is in radian.
        //
        APL_Float real = real_flt * cos(radian_flt);
        APL_Float imag = real_flt * sin(radian_flt);
        tos.push_back(Token(TOK_COMPLEX, real, imag));
        Log(LOG_tokenize)   CERR << "  tokenize_number: complex " << real
                                 << "J" << imag << endl;
      }
   else
      {
        if (real_need_float)
           {
             tos.push_back(Token(TOK_REAL,    real_flt));
             Log(LOG_tokenize)
                CERR << "  tokenize_number: real " << real_flt << endl;
           }
        else
           {
             tos.push_back(Token(TOK_INTEGER, real_int));
             Log(LOG_tokenize)
                CERR << "  tokenize_number: integer " << real_int << endl;
           }
      }

done:
   // ISO 13751 requires a space between two numeric scalar literals (page 42),
   // but not between a numeric scalar literal and an identifier.
   //
   // IBM APL2 requires a space in both cases. For example, 10Q10 is tokenized
   // as  10Q 10
   //
   // We follow ISO. The second numeric literal cannot start with 0-9 because
   // that would have been eaten by the first literal. Therefore the only cases
   // remaining to be checked are a numeric scalar literal followed by ¯
   // or by . (page 42)
   //
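   // For example,  2X  passes this check (number 2 followed by name X),
   // while  2¯3  and  1.2.3  raise a BAD NUMBER error.
   //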
   if (src.rest())
      {
        if (*src == UNI_OVERBAR || *src == '.')
           Error::throw_parse_error(E_BAD_NUMBER, LOC, loc);
      }
}
//-----------------------------------------------------------------------------

enum { MAX_TOKENIZE_DIGITS_1 = 20,                       // incl. rounding digit
       MAX_TOKENIZE_DIGITS = MAX_TOKENIZE_DIGITS_1 - 1   // excl. rounding digit
     };

#define exp_0_9(x) x ## 0, x ## 1, x ## 2, x ## 3, x ## 4,  \
                           x ## 5, x ## 6, x ## 7, x ## 8, x ## 9,

static const long double expo_tab[309] =
{
   exp_0_9(1E)   exp_0_9(1E1)  exp_0_9(1E2)  exp_0_9(1E3)  exp_0_9(1E4)
   exp_0_9(1E5)  exp_0_9(1E6)  exp_0_9(1E7)  exp_0_9(1E8)  exp_0_9(1E9)
   exp_0_9(1E10) exp_0_9(1E11) exp_0_9(1E12) exp_0_9(1E13) exp_0_9(1E14)
   exp_0_9(1E15) exp_0_9(1E16) exp_0_9(1E17) exp_0_9(1E18) exp_0_9(1E19)
   exp_0_9(1E20) exp_0_9(1E21) exp_0_9(1E22) exp_0_9(1E23) exp_0_9(1E24)
   exp_0_9(1E25) exp_0_9(1E26) exp_0_9(1E27) exp_0_9(1E28) exp_0_9(1E29)
   1E300, 1E301, 1E302, 1E303, 1E304, 1E305, 1E306, 1E307, 1E308
};

static const long double nexpo_tab[310] =
{
   exp_0_9(1E-)   exp_0_9(1E-1)  exp_0_9(1E-2)  exp_0_9(1E-3)  exp_0_9(1E-4)
   exp_0_9(1E-5)  exp_0_9(1E-6)  exp_0_9(1E-7)  exp_0_9(1E-8)  exp_0_9(1E-9)
   exp_0_9(1E-10) exp_0_9(1E-11) exp_0_9(1E-12) exp_0_9(1E-13) exp_0_9(1E-14)
   exp_0_9(1E-15) exp_0_9(1E-16) exp_0_9(1E-17) exp_0_9(1E-18) exp_0_9(1E-19)
   exp_0_9(1E-20) exp_0_9(1E-21) exp_0_9(1E-22) exp_0_9(1E-23) exp_0_9(1E-24)
   exp_0_9(1E-25) exp_0_9(1E-26) exp_0_9(1E-27) exp_0_9(1E-28) exp_0_9(1E-29)
   exp_0_9(1E-30)
};
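
// expo_tab[n] is 10^n for 0 ≤ n ≤ 308, and nexpo_tab[n] is 10^-n for
// 0 ≤ n ≤ 309. They are used in tokenize_real() below to scale the collected
// digits by the decimal exponent.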

bool
Tokenizer::tokenize_real(Unicode_source & src, bool & need_float,
                         APL_Float & flt_val, APL_Integer & int_val)
{
   int_val = 0;
   need_float = false;

UTF8_string int_digits;
UTF8_string fract_digits;
UTF8_string expo_digits;
bool negative = false;
bool expo_negative = false;
bool skipped_0 = false;
bool dot_seen = false;

   // hexadecimal ?
   //
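   // a leading $ introduces a hexadecimal integer, e.g. $FF is 255 and
   // $7fff is 32767.
   //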
   if (src.rest() > 1 && *src == UNI_ASCII_DOLLAR_SIGN)
      {
        src.get();   // skip $
        if (!Avec::is_hex_digit(*src))   return false;   // no hex after $
        while (src.rest())
           {
             int digit;
             if      (*src <  UNI_ASCII_0)   break;
             else if (*src <= UNI_ASCII_9)   digit = src.get() - '0';
             else if (*src <  UNI_ASCII_A)   break;
             else if (*src <= UNI_ASCII_F)   digit = 10 + src.get() - 'A';
             else if (*src <  UNI_ASCII_a)   break;
             else if (*src <= UNI_ASCII_f)   digit = 10 + src.get() - 'a';
             else                            break;
             int_val = int_val << 4 | digit;
           }
        flt_val = int_val;
        return true;   // OK
      }

   // 1. split src into integer, fractional, and exponent parts, removing:
   // 1a. a leading sign of the integer part
   // 1b. leading zeros of the integer part
   // 1c. the . between the integer and fractional parts
   // 1d. trailing zeros of the fractional part
   // 1e. the E between the fractional and exponent parts
   // 1f. a sign of the exponent part
   //
   if (src.rest() && *src == UNI_OVERBAR)   // 1a.
      {
        negative = true;
        ++src;
      }

   // 1b. skip leading zeros in integer part
   //
   while (src.rest() && *src == '0')   { ++src;   skipped_0 = true; }

   // integer part
   //
   while (src.rest() && Avec::is_digit(*src))   int_digits += src.get();

   // fractional part
   //
   if (src.rest() && *src == UNI_ASCII_FULLSTOP)   // fract part present
      {
        ++src;   // 1c. skip '.'
        dot_seen = true;
        while (src.rest() && Avec::is_digit(*src))
           {
             fract_digits += src.get();
           }

        while (fract_digits.size() && fract_digits.back() == '0')   // 1d.
           fract_digits.pop_back();
      }

   // require at least one integer or fractional digit
   //
   if (int_digits.size() == 0    &&
       fract_digits.size() == 0  &&
       !skipped_0)   return false;   // error

   // exponent part (but could also be a name starting with E or e)
   //
   if (src.rest() >= 2 &&   // at least E and a digit or ¯
       (*src == UNI_ASCII_E || *src == UNI_ASCII_e))   // maybe exponent
      {
        expo_negative = (src[1] == UNI_OVERBAR);
        if (expo_negative   &&
            src.rest() >= 3 &&
            Avec::is_digit(src[2]))                    // E¯nnn
           {
             need_float = true;
             ++src;                        // skip e/E
             ++src;                        // skip ¯
             while (src.rest() && Avec::is_digit(*src))
                  expo_digits += src.get();
           }
        else if (Avec::is_digit(src[1]))               // Ennn
           {
             need_float = true;
             ++src;                        // skip e/E
             while (src.rest() && Avec::is_digit(*src))
                  expo_digits += src.get();
           }
      }

   // second dot ?
   if (dot_seen && src.rest() && *src == UNI_ASCII_FULLSTOP)
      return false;   // error

int expo = 0;
   loop(d, expo_digits.size())   expo = 10*expo + (expo_digits[d] - '0');
   if (expo_negative)   expo = - expo;
   expo += int_digits.size();

UTF8_string digits = int_digits;
   digits.append_UTF8(fract_digits);

   // at this point, digits is the fractional part ff... of 0.ff... ×10⋆expo
   // discard leading fractional 0s and adjust expo accordingly
   //
   while (digits.size() && digits[0] == '0')
      {
        digits.erase(0);   // discard '0'
        --expo;
      }
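
   // Example: the input 12.50E¯3 gives int_digits "12", fract_digits "5"
   // (the trailing 0 was dropped), expo = ¯3 + 2 = ¯1, and digits "125",
   // i.e. 0.125×10⋆¯1 = 0.0125.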

   // at this point, digits is the fractional part ff... of 0.ff... ×10⋆expo
   // from 0.1000... to 0.9999... Discard digits beyond integer precision.
   //
   // The largest 64-bit integer is 0x7FFFFFFFFFFFFFFF = 9223372036854775807
   // which has 19 decimal digits. Discard all after first 20 digits and
   // round according to the last digit.
   //
   if (digits.size() > MAX_TOKENIZE_DIGITS_1)
      digits.resize(MAX_TOKENIZE_DIGITS_1);

   if (digits.size() == MAX_TOKENIZE_DIGITS_1)
      {
        if (digits.round_0_1())   ++expo;
      }

   // special cases: all digits 0 or very small number
   //
   if (digits.size() == 0)   return true;   // OK: all digits were 0
   if (expo <= -307)         return true;   // OK: very small number

   Assert(digits.size() <= MAX_TOKENIZE_DIGITS);

   if (expo > MAX_TOKENIZE_DIGITS)   need_float = true;   // large integer

   // special case: integer between 9223372036854775807 and 9999999999999999999
   //
   if (expo == MAX_TOKENIZE_DIGITS && digits[0] == '9')
      {
        // a uint64_t compare might overflow, so we compare the digit string
        const char * maxint = "9223372036854775807";
        loop(j, digits.size())
            {
               if (digits[j] < maxint[j])    break;
               if (digits[j] == maxint[j])   continue;
               need_float = true;
               break;
            }
      }

   if (int(digits.size()) > expo)   need_float = true;

   if (need_float)
      {
        if (digits.size() > 17)   digits.resize(17);

        int64_t v = 0;
        loop(j, digits.size())
          {
            v = 10*v + (digits[j] - '0');
            --expo;
          }

        if (expo > 0)
           {
             if (expo > 308)   return false;
             if (negative)   flt_val = - v * expo_tab[expo];
             else            flt_val =   v * expo_tab[expo];
             return true;   // OK
           }
        else if (expo < 0)
           {
             if (negative)   flt_val = - v * nexpo_tab[-expo];
             else            flt_val =   v * nexpo_tab[-expo];
             return true;   // OK
           }

        if (negative)   flt_val = - v;
        else            flt_val =   v;
        return true;   // OK
      }
   else
      {
        int_val = 0;
        loop(j, digits.size())   int_val = 10*int_val + (digits[j] - '0');
        if (negative)   int_val = - int_val;
        flt_val = int_val;
        return true;   // OK
      }
}
//-----------------------------------------------------------------------------
void
Tokenizer::tokenize_symbol(Unicode_source & src, Token_string & tos)
{
   Log(LOG_tokenize)   CERR << "tokenize_symbol() : " << src.rest() << endl;

UCS_string symbol;
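   // inside a macro, symbol names are prefixed with µ- (presumably so that
   // they cannot collide with user-defined names)
   //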
   if (macro)
      {
        symbol.append(UNI_MUE);
        symbol.append(UNI_ASCII_MINUS);
      }
   symbol.append(src.get());

   while (src.rest())
       {
         const Unicode uni = *src;
         if (!Avec::is_symbol_char(uni))   break;
         symbol.append(uni);
         ++src;
       }

   if (symbol.size() > 2 && symbol[1] == UNI_DELTA  &&
       (symbol[0] == UNI_ASCII_S || symbol[0] == UNI_ASCII_T))
      {
        // S∆ or T∆
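        //
        // S∆name and T∆name are the legacy forms of ⎕STOP resp. ⎕TRACE.
        // They are mapped onto those system functions: the name (without
        // the S∆/T∆ prefix) becomes a character vector argument, on the
        // left if a value is assigned (S∆name ← ...) and on the right
        // otherwise.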

        while (src.rest() && *src <= UNI_ASCII_SPACE)   src.get();   // spaces
        UCS_string symbol1(symbol, 2, symbol.size() - 2);   // without S∆/T∆
        Value_P AB(symbol1, LOC);
        Function * ST = 0;
        if (symbol[0] == UNI_ASCII_S) ST = Quad_STOP::fun;
        else                          ST = Quad_TRACE::fun;

        const bool assigned = (src.rest() && *src == UNI_LEFT_ARROW);
        if (assigned)   // dyadic: AB ∆fun
           {
             src.get();                                // skip ←
             Log(LOG_tokenize)
                CERR << "Stop/Trace assigned: " << symbol1 << endl;
             tos.push_back(Token(TOK_APL_VALUE1, AB));   // left argument of ST
             tos.push_back(Token(TOK_FUN2, ST));
           }
        else
           {
             Log(LOG_tokenize)
                CERR << "Stop/Trace referenced: " << symbol1 << endl;
             tos.push_back(Token(TOK_FUN2, ST));
             tos.push_back(Token(TOK_APL_VALUE1, AB));   // right argument of ST
           }

        return;
      }

Symbol * sym = Workspace::lookup_symbol(symbol);
   Assert(sym);
   tos.push_back(Token(TOK_SYMBOL, sym));
}
//-----------------------------------------------------------------------------