1 /*
2 This file is part of GNU APL, a free implementation of the
3 ISO/IEC Standard 13751, "Programming Language APL, Extended"
4
5 Copyright (C) 2008-2015 Dr. Jürgen Sauermann
6
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #include <string.h>
22
23 #include "Bif_F12_FORMAT.hh"
24 #include "Bif_F12_SORT.hh"
25 #include "Bif_F12_TAKE_DROP.hh"
26 #include "Bif_OPER1_COMMUTE.hh"
27 #include "Bif_OPER1_EACH.hh"
28 #include "Bif_OPER2_INNER.hh"
29 #include "Bif_OPER2_OUTER.hh"
30 #include "Bif_OPER2_POWER.hh"
31 #include "Bif_OPER2_RANK.hh"
32 #include "Bif_OPER1_REDUCE.hh"
33 #include "Bif_OPER1_SCAN.hh"
34 #include "CharCell.hh"
35 #include "Common.hh"
36 #include "ComplexCell.hh"
37 #include "FloatCell.hh"
38 #include "IntCell.hh"
39 #include "Output.hh"
40 #include "PointerCell.hh"
41 #include "Symbol.hh"
42 #include "SystemLimits.hh"
43 #include "SystemVariable.hh"
44 #include "Tokenizer.hh"
45 #include "Value.hh"
46 #include "Workspace.hh"
47
48 //-----------------------------------------------------------------------------
operator <<(ostream & out,const Unicode_source & src)49 inline ostream & operator << (ostream & out, const Unicode_source & src)
50 { loop(s, src.rest()) out << src[s]; return out; }
51 //-----------------------------------------------------------------------------
52 /** convert \b UCS_string input into a Token_string tos.
53 */
54 ErrorCode
tokenize(const UCS_string & input,Token_string & tos)55 Tokenizer::tokenize(const UCS_string & input, Token_string & tos)
56 {
57 try
58 {
59 do_tokenize(input, tos);
60 }
61 catch (Error err)
62 {
63 const int caret_1 = input.size() - rest_1;
64 const int caret_2 = input.size() - rest_2;
65 err.set_error_line_2(input, caret_1, caret_2);
66 return err.get_error_code();
67 }
68
69 return E_NO_ERROR;
70 }
71 //-----------------------------------------------------------------------------
/** convert \b UCS_string input into a Token_string tos.

    This is the tokenizer main loop. For every remaining character of
    \b input it asks Avec::uni_to_token() for the token class that the
    character starts and then dispatches to the proper tokenize_XXX()
    helper (or handles the token inline for the trivial classes).
    Tokenizing stops at the end of the input or at a comment character
    (⍝ or #). Before throwing a parse error, rest_2 is updated so that
    the caller can compute the caret position of the error.
 */
void
Tokenizer::do_tokenize(const UCS_string & input, Token_string & tos)
{
   Log(LOG_tokenize)
      CERR << "tokenize: input[" << input.size() << "] is: «"
           << input << "»" << endl;

   Unicode_source src(input);

   // rest_1 is the number of characters left when the current token starts,
   // rest_2 the number left where an error (if any) was detected.
   //
   while ((rest_1 = rest_2 = src.rest()) != 0)
      {
        Unicode uni = *src;
        if (uni == UNI_COMMENT)             break;   // ⍝ comment
        if (uni == UNI_ASCII_NUMBER_SIGN)   break;   // # comment

        const Token tok = Avec::uni_to_token(uni, LOC);

        Log(LOG_tokenize)
           {
             // show at most 24 characters of the remaining input
             Unicode_source s1(src, 0, 24);
             CERR << " tokenize(" << src.rest() << " chars) sees [tag "
                  << tok.tag_name() << " «" << uni << "»] " << s1;
             if (src.rest() != s1.rest())   CERR << " ...";
             CERR << endl;
           }

        switch(tok.get_Class())
           {
             case TC_END:   // chars without APL meaning
                  rest_2 = src.rest();
                  {
                    Log(LOG_error_throw)
                       CERR << endl << "throwing "
                            << Error::error_name(E_NO_TOKEN)
                            << " in Tokenizer" << endl;

                    // build a "U+XXXX (c)" hint for the offending character
                    char cc[20];
                    snprintf(cc, sizeof(cc), "U+%4.4X (", uni);
                    MORE_ERROR() << "Tokenizer: No token for Unicode "
                                 << cc << uni << ")\nInput: " << input;
                    Error error(E_NO_TOKEN, LOC);
                    throw error;
                  }
                  break;

             // token classes that can never be produced from a single input
             // character: complain and throw a parse error.
             //
             case TC_RETURN:
             case TC_LINE:
             case TC_VALUE:
             case TC_INDEX:
                  CERR << "Offending token: " << tok.get_tag()
                       << " (" << tok << ")" << endl;
                  if (tok.get_tag() == TOK_CHARACTER)
                     CERR << "Unicode: " << UNI(tok.get_char_val()) << endl;
                  rest_2 = src.rest();
                  Error::throw_parse_error(E_NON_APL_CHAR, LOC, loc);
                  break;

             case TC_VOID:
                  // Avec::uni_to_token returns TC_VOID for non-apl characters
                  //
                  rest_2 = src.rest();
                  UERR << "Unknown APL character: " << uni
                       << " (" << UNI(uni) << ")" << endl;
                  Error::throw_parse_error(E_NON_APL_CHAR, LOC, loc);
                  break;

             case TC_SYMBOL:
                  // distinguish the special one-character symbols
                  // (⎕-names, ⍞, ⍺, ⍶, χ, λ, ⍵, ⍹) from user defined names
                  //
                  if (Avec::is_quad(uni))
                     {
                       tokenize_quad(src, tos);
                     }
                  else if (uni == UNI_QUOTE_Quad)
                     {
                       ++src;
                       tos.push_back(Token(TOK_Quad_QUOTE,
                                           &Workspace::get_v_Quad_QUOTE()));
                     }
                  else if (uni == UNI_ALPHA)
                     {
                       ++src;
                       tos.push_back(Token(TOK_ALPHA,
                                           &Workspace::get_v_ALPHA()));
                     }
                  else if (uni == UNI_ALPHA_UNDERBAR)
                     {
                       ++src;
                       tos.push_back(Token(TOK_ALPHA_U,
                                           &Workspace::get_v_ALPHA_U()));
                     }
                  else if (uni == UNI_CHI)
                     {
                       ++src;
                       tos.push_back(Token(TOK_CHI,
                                           &Workspace::get_v_CHI()));
                     }
                  else if (uni == UNI_LAMBDA)
                     {
                       // this could be λ like in λ← ...
                       // or λ1 or λ2 or ... as in ... ⍺ λ1 ⍵
                       //
                       if (src.rest() > 1 && Avec::is_digit(src[1]))   // λn
                          {
                            tokenize_symbol(src, tos);
                          }
                       else   // λ
                          {
                            ++src;
                            tos.push_back(Token(TOK_LAMBDA,
                                                &Workspace::get_v_LAMBDA()));
                          }
                     }
                  else if (uni == UNI_OMEGA)
                     {
                       ++src;
                       tos.push_back(Token(TOK_OMEGA,
                                           &Workspace::get_v_OMEGA()));
                     }
                  else if (uni == UNI_OMEGA_UNDERBAR)
                     {
                       ++src;
                       tos.push_back(Token(TOK_OMEGA_U,
                                           &Workspace::get_v_OMEGA_U()));
                     }
                  else
                     {
                       tokenize_symbol(src, tos);
                     }
                  break;

             case TC_FUN0:
             case TC_FUN12:
             case TC_OPER1:
                  tokenize_function(src, tos);
                  break;

             case TC_OPER2:
                  if (tok.get_tag() == TOK_OPER2_INNER && src.rest())
                     {
                       // tok is a dot. This could mean that . is either
                       //
                       // the start of a number:      e.g. +.3
                       // or an operator:             e.g. +.*
                       // or a syntax error:          e.g. Done.
                       //
                       if (src.rest() == 1)   // syntax error
                          Error::throw_parse_error(E_SYNTAX_ERROR, LOC, loc);

                       Unicode uni_1 = src[1];
                       const Token tok_1 = Avec::uni_to_token(uni_1, LOC);
                       if ((tok_1.get_tag() & TC_MASK) == TC_NUMERIC)
                          tokenize_number(src, tos);
                       else
                          tokenize_function(src, tos);
                     }
                  else
                     {
                       tokenize_function(src, tos);
                     }

                  // ∘. (jot followed by dot) is the outer product operator:
                  // replace the just-pushed inner product token in place.
                  //
                  if (tos.size() >= 2 &&
                      tos.back().get_tag() == TOK_OPER2_INNER &&
                      tos[tos.size() - 2].get_tag() == TOK_JOT)
                     {
                       new (&tos.back()) Token(TOK_OPER2_OUTER,
                                               Bif_OPER2_OUTER::fun);
                     }

                  break;

             case TC_R_ARROW:
                  // → followed by more input is a branch; a → at the very
                  // end of the line is the escape token.
                  //
                  ++src;
                  if (src.rest())   tos.push_back(tok);
                  else              tos.push_back(Token(TOK_ESCAPE));
                  break;

             case TC_ASSIGN:
                  ++src;
                  {
                    const bool sym = tos.size() >= 1 &&
                        tos[tos.size() - 1].get_tag() == TOK_SYMBOL;
                    const bool dia = tos.size() > 1 &&
                        tos[tos.size() - 2].get_tag() == TOK_DIAMOND;
                    const bool col = tos.size() > 1 &&
                        tos[tos.size() - 2].get_tag() == TOK_COLON;

                    tos.push_back(tok);

                    // change token tag of ← if:
                    //
                    //     SYM ←  (at the start of line), or
                    //   ◊ SYM ←  (at the start of statement), or
                    //   : SYM ←  (at the start of statement after label)
                    //
                    if (sym && ((tos.size() == 2) || dia || col))
                       tos.back().ChangeTag(TOK_ASSIGN1);
                  }
                  break;

             case TC_L_PARENT:
             case TC_R_PARENT:
             case TC_L_BRACK:
             case TC_R_BRACK:
             case TC_L_CURLY:
             case TC_R_CURLY:
                  ++src;
                  tos.push_back(tok);
                  break;

             case TC_DIAMOND:
                  ++src;
                  tos.push_back(tok);
                  break;

             case TC_COLON:
                  // a colon (label) is only allowed inside a function body
                  //
                  if (pmode != PM_FUNCTION)
                     {
                       rest_2 = src.rest();
                       if (pmode == PM_EXECUTE)
                          Error::throw_parse_error(E_ILLEGAL_COLON_EXEC,
                                                   LOC, loc);
                       else
                          Error::throw_parse_error(E_ILLEGAL_COLON_STAT,
                                                   LOC, loc);
                     }

                  ++src;
                  tos.push_back(tok);
                  break;

             case TC_NUMERIC:
                  tokenize_number(src, tos);
                  break;

             case TC_SPACE:
             case TC_NEWLINE:
                  ++src;
                  break;

             case TC_QUOTE:
                  if (tok.get_tag() == TOK_QUOTE1)
                     tokenize_string1(src, tos);
                  else
                     tokenize_string2(src, tos);
                  break;

             default:
                  // Avec::uni_to_token() returned a class that the
                  // tokenizer does not know about: internal error.
                  //
                  CERR << "Input: " << input << endl
                       << "uni:   " << uni << endl
                       << "Token = " << tok.get_tag() << endl;

                  if (tok.get_Id() != ID_No_ID)
                     {
                       CERR << ", Id = " << Id(tok.get_tag() >> 16);
                     }
                  CERR << endl;
                  Assert(0 && "Should not happen");
           }
      }

   Log(LOG_tokenize)
      CERR << "tokenize() done (no error)" << endl;
}
334 //-----------------------------------------------------------------------------
335 void
tokenize_function(Unicode_source & src,Token_string & tos)336 Tokenizer::tokenize_function(Unicode_source & src, Token_string & tos)
337 {
338 Log(LOG_tokenize) CERR << "tokenize_function(" << src << ")" << endl;
339
340 const Unicode uni = src.get();
341 const Token tok = tokenize_function(uni);
342 tos.push_back(tok);
343 }
344 //-----------------------------------------------------------------------------
/// return the token for the primitive APL function or operator denoted by
/// character \b uni, with the pointer to the built-in implementation
/// (Bif_XXX::fun) attached to it
Token
Tokenizer::tokenize_function(Unicode uni)
{
   // uni_to_token() provides the token tag; the sys() macro below then
   // attaches the proper built-in function pointer for that tag.
   //
   const Token tok = Avec::uni_to_token(uni, LOC);

#define sys(t, f) \
   case TOK_ ## t: return Token(tok.get_tag(), Bif_ ## f::fun);   break;

   switch(tok.get_tag())
      {
        // niladic and monadic functions
        //
        sys(F0_ZILDE,      F0_ZILDE)
        sys(F1_EXECUTE,    F1_EXECUTE)

        // dyadic-only functions
        //
        sys(F2_AND,        F2_AND)
        sys(F2_EQUAL,      F2_EQUAL)
        sys(F2_FIND,       F2_FIND)
        sys(F2_GREATER,    F2_GREATER)
        sys(F2_INDEX,      F2_INDEX)
        sys(F2_LESS,       F2_LESS)
        sys(F2_LEQ,        F2_LEQ)
        sys(F2_MEQ,        F2_MEQ)
        sys(F2_NAND,       F2_NAND)
        sys(F2_NOR,        F2_NOR)
        sys(F2_OR,         F2_OR)
        sys(F2_UNEQ,       F2_UNEQ)

        // monadic-or-dyadic functions
        //
        sys(F12_BINOM,     F12_BINOM)
        sys(F12_CIRCLE,    F12_CIRCLE)
        sys(F12_COMMA,     F12_COMMA)
        sys(F12_COMMA1,    F12_COMMA1)
        sys(F12_DECODE,    F12_DECODE)
        sys(F12_DIVIDE,    F12_DIVIDE)
        sys(F12_DOMINO,    F12_DOMINO)
        sys(F12_DROP,      F12_DROP)
        sys(F12_ELEMENT,   F12_ELEMENT)
        sys(F12_ENCODE,    F12_ENCODE)
        sys(F12_EQUIV,     F12_EQUIV)
        sys(F12_FORMAT,    F12_FORMAT)
        sys(F12_INDEX_OF,  F12_INDEX_OF)
        sys(F2_INTER,      F2_INTER)
        sys(F2_LEFT,       F2_LEFT)
        sys(F12_LOGA,      F12_LOGA)
        sys(F12_MINUS,     F12_MINUS)
        sys(F12_NEQUIV,    F12_NEQUIV)
        sys(F12_PARTITION, F12_PARTITION)
        sys(F12_PICK,      F12_PICK)
        sys(F12_PLUS,      F12_PLUS)
        sys(F12_POWER,     F12_POWER)
        sys(F12_RHO,       F12_RHO)
        sys(F2_RIGHT,      F2_RIGHT)
        sys(F12_RND_DN,    F12_RND_DN)
        sys(F12_RND_UP,    F12_RND_UP)
        sys(F12_ROLL,      F12_ROLL)
        sys(F12_ROTATE,    F12_ROTATE)
        sys(F12_ROTATE1,   F12_ROTATE1)
        sys(F12_SORT_ASC,  F12_SORT_ASC)
        sys(F12_SORT_DES,  F12_SORT_DES)
        sys(F12_STILE,     F12_STILE)
        sys(F12_TAKE,      F12_TAKE)
        sys(F12_TRANSPOSE, F12_TRANSPOSE)
        sys(F12_TIMES,     F12_TIMES)
        sys(F12_UNION,     F12_UNION)
        sys(F12_WITHOUT,   F12_WITHOUT)

        sys(JOT,           JOT)

        // operators
        //
        sys(OPER1_COMMUTE, OPER1_COMMUTE)
        sys(OPER1_EACH,    OPER1_EACH)
        sys(OPER2_POWER,   OPER2_POWER)
        sys(OPER2_RANK,    OPER2_RANK)
        sys(OPER1_REDUCE,  OPER1_REDUCE)
        sys(OPER1_REDUCE1, OPER1_REDUCE1)
        sys(OPER1_SCAN,    OPER1_SCAN)
        sys(OPER1_SCAN1,   OPER1_SCAN1)

        sys(OPER2_INNER,   OPER2_INNER)

        default: break;
      }

   // CAUTION: cannot print entire token here because Avec::uni_to_token()
   // inits the token tag but not any token pointers!
   //
   CERR << endl << "Token = " << tok.get_tag() << endl;
   Assert(0 && "Missing Function");

#undef sys
   return tok;
}
434 //-----------------------------------------------------------------------------
435 void
tokenize_quad(Unicode_source & src,Token_string & tos)436 Tokenizer::tokenize_quad(Unicode_source & src, Token_string & tos)
437 {
438 Log(LOG_tokenize)
439 CERR << "tokenize_quad(" << src.rest() << " chars)"<< endl;
440
441 src.get(); // discard (possibly alternative) ⎕
442 UCS_string ucs(UNI_Quad_Quad);
443 Assert(ucs[0]);
444
445 if (src.rest() > 0) ucs.append(src[0]);
446 if (src.rest() > 1) ucs.append(src[1]);
447 if (src.rest() > 2) ucs.append(src[2]);
448 if (src.rest() > 3) ucs.append(src[3]);
449 if (src.rest() > 4) ucs.append(src[4]);
450
451 int len = 0;
452 const Token t = Workspace::get_quad(ucs, len);
453 src.skip(len - 1);
454 tos.push_back(t);
455 }
456 //-----------------------------------------------------------------------------
/** tokenize a single quoted string.
 ** If the string is a single character, then we
 ** return a TOK_CHARACTER. Otherwise we return TOK_APL_VALUE1.
 **/
void
Tokenizer::tokenize_string1(Unicode_source & src, Token_string & tos)
{
   Log(LOG_tokenize) CERR << "tokenize_string1(" << src << ")" << endl;

   // consume and check the leading quote
   //
   const Unicode uni = src.get();
   Assert(Avec::is_single_quote(uni));

   UCS_string string_value;
   bool got_end = false;   // closing quote seen ?

   while (src.rest())
      {
        const Unicode uni = src.get();

        if (Avec::is_single_quote(uni))
           {
             // a single ' is the end of the string, while a double '
             // (i.e. '') is a single '.
             //
             if ((src.rest() == 0) || !Avec::is_single_quote(*src))
                {
                  got_end = true;
                  break;
                }

             string_value.append(UNI_SINGLE_QUOTE);
             ++src;   // skip the second '
           }
        else if (uni == UNI_ASCII_CR)   // silently ignore CR
           {
             continue;
           }
        else if (uni == UNI_ASCII_LF)   // end of line before the closing '
           {
             rest_2 = src.rest();
             Error::throw_parse_error(E_NO_STRING_END, LOC, loc);
           }
        else
           {
             string_value.append(uni);
           }
      }

   if (!got_end)   Error::throw_parse_error(E_NO_STRING_END, LOC, loc);

   if (string_value.size() == 1)   // scalar
      {
        tos.push_back(Token(TOK_CHARACTER, string_value[0]));
      }
   else
      {
        tos.push_back(Token(TOK_APL_VALUE1, Value_P(string_value, LOC)));
      }
}
516 //-----------------------------------------------------------------------------
517 /** tokenize a double quoted string.
518 ** If the string is a single character, then we
519 ** return a TOK_CHARACTER. Otherwise we return TOK_APL_VALUE1.
520 **/
521 void
tokenize_string2(Unicode_source & src,Token_string & tos)522 Tokenizer::tokenize_string2(Unicode_source & src, Token_string & tos)
523 {
524 Log(LOG_tokenize) CERR << "tokenize_string2(" << src << ")" << endl;
525
526 // skip the leading "
527 {
528 const Unicode uni = src.get();
529 if (uni) { /* do nothing, needed for -Wall */ }
530
531 Assert1(uni == UNI_ASCII_DOUBLE_QUOTE);
532 }
533
534 UCS_string string_value;
535 bool got_end = false;
536
537 while (src.rest())
538 {
539 const Unicode uni = src.get();
540
541 if (uni == UNI_ASCII_DOUBLE_QUOTE) // terminating "
542 {
543 got_end = true;
544 break;
545 }
546 else if (uni == UNI_ASCII_CR) // ignore CR
547 {
548 continue;
549 }
550 else if (uni == UNI_ASCII_LF) // end of line before "
551 {
552 rest_2 = src.rest();
553 Error::throw_parse_error(E_NO_STRING_END, LOC, loc);
554 }
555 else if (uni == UNI_ASCII_BACKSLASH) // backslash
556 {
557 const Unicode uni1 = src.get();
558 switch(uni1)
559 {
560 case '0': string_value.append(UNI_ASCII_NUL); break;
561 case 'a': string_value.append(UNI_ASCII_BEL); break;
562 case 'b': string_value.append(UNI_ASCII_BS); break;
563 case 't': string_value.append(UNI_ASCII_HT); break;
564 case 'n': string_value.append(UNI_ASCII_LF); break;
565 case 'v': string_value.append(UNI_ASCII_VT); break;
566 case 'f': string_value.append(UNI_ASCII_FF); break;
567 case 'r': string_value.append(UNI_ASCII_CR); break;
568 case '[': string_value.append(UNI_ASCII_ESC); break;
569 case '"': string_value.append(UNI_ASCII_DOUBLE_QUOTE);break;
570 case '\\': string_value.append(UNI_ASCII_BACKSLASH); break;
571 default: string_value.append(uni);
572 string_value.append(uni1);
573 }
574 }
575 else
576 {
577 string_value.append(uni);
578 }
579 }
580
581 if (!got_end) Error::throw_parse_error(E_NO_STRING_END, LOC, loc);
582
583 else
584 {
585 tos.push_back(Token(TOK_APL_VALUE1, Value_P(string_value, LOC)));
586 }
587 }
588 //-----------------------------------------------------------------------------
589 void
tokenize_number(Unicode_source & src,Token_string & tos)590 Tokenizer::tokenize_number(Unicode_source & src, Token_string & tos)
591 {
592 Log(LOG_tokenize) CERR << "tokenize_number(" << src << ")" << endl;
593
594 // numbers:
595 // real
596 // real 'J' real
597 // real 'D' real // magnitude + angle in degrees
598 // real 'R' real // magnitude + angle in radian
599
600 APL_Float real_flt = 0.0; // always valid
601 APL_Integer real_int = 0; // valid if need_float is false
602 bool real_need_float = false;
603 const bool real_valid = tokenize_real(src, real_need_float, real_flt, real_int);
604 if (!real_need_float) real_flt = real_int;
605 if (!real_valid)
606 {
607 rest_2 = src.rest();
608 Error::throw_parse_error(E_BAD_NUMBER, LOC, loc);
609 }
610
611 if (src.rest() && (*src == UNI_ASCII_J || *src == UNI_ASCII_j))
612 {
613 ++src; // skip 'J'
614
615 APL_Float imag_flt = 0.0; // always valid
616 APL_Integer imag_int = 0; // valid if imag_need_float is false
617 bool imag_need_float = false;
618 const bool imag_valid = tokenize_real(src, imag_need_float,
619 imag_flt, imag_int);
620 if (!imag_need_float) imag_flt = imag_int;
621
622 if (!imag_valid)
623 {
624 --src; // undo skip 'J'
625 if (real_need_float)
626 {
627 tos.push_back(Token(TOK_REAL, real_flt));
628 Log(LOG_tokenize)
629 CERR << " tokenize_number: real " << real_flt << endl;
630 }
631 else
632 {
633 tos.push_back(Token(TOK_INTEGER, real_int));
634 Log(LOG_tokenize)
635 CERR << " tokenize_number: integer " << real_int << endl;
636 }
637 goto done;;
638 }
639
640 tos.push_back(Token(TOK_COMPLEX, real_flt, imag_flt));
641 Log(LOG_tokenize) CERR << " tokenize_number: complex "
642 << real_flt << "J" << imag_flt << endl;
643 }
644 else if (src.rest() && (*src == UNI_ASCII_D || *src == UNI_ASCII_d))
645 {
646 ++src; // skip 'D'
647
648 APL_Float degrees_flt = 0; // always valid
649 APL_Integer degrees_int = 0; // valid if imag_floating is true
650 bool imag_need_float = false;
651 const bool imag_valid = tokenize_real(src, imag_need_float,
652 degrees_flt, degrees_int);
653 if (!imag_need_float) degrees_flt = degrees_int;
654
655 if (!imag_valid)
656 {
657 --src; // undo skip 'D'
658 if (real_need_float)
659 {
660 tos.push_back(Token(TOK_REAL, real_flt));
661 Log(LOG_tokenize)
662 CERR << " tokenize_number: real " << real_flt << endl;
663 }
664 else
665 {
666 tos.push_back(Token(TOK_INTEGER, real_int));
667 Log(LOG_tokenize)
668 CERR << " tokenize_number: integer " << real_int << endl;
669 }
670 goto done;;
671 }
672
673 // real_flt is the magnitude and the angle is in degrees.
674 //
675 APL_Float real = real_flt * cos(M_PI*degrees_flt / 180.0);
676 APL_Float imag = real_flt * sin(M_PI*degrees_flt / 180.0);
677 tos.push_back(Token(TOK_COMPLEX, real, imag));
678 Log(LOG_tokenize) CERR << " tokenize_number: complex " << real
679 << "J" << imag << endl;
680 }
681 else if (src.rest() && (*src == UNI_ASCII_R || *src == UNI_ASCII_r))
682 {
683 ++src; // skip 'R'
684
685 APL_Float radian_flt; // always valid
686 APL_Integer radian_int; // valid if imag_floating is true
687 bool radian_need_float = false;
688 const bool imag_valid = tokenize_real(src, radian_need_float,
689 radian_flt, radian_int);
690 if (!radian_need_float) radian_flt = radian_int;
691
692 if (!imag_valid)
693 {
694 --src; // undo skip 'R'
695 if (real_need_float)
696 {
697 tos.push_back(Token(TOK_REAL, real_flt));
698 Log(LOG_tokenize)
699 CERR << " tokenize_number: real " << real_flt << endl;
700 }
701 else
702 {
703 tos.push_back(Token(TOK_INTEGER, real_int));
704 Log(LOG_tokenize)
705 CERR << " tokenize_number: integer " << real_int << endl;
706 }
707 goto done;;
708 }
709
710 // real_flt is the magnitude and the angle is in radian.
711 //
712 APL_Float real = real_flt * cos(radian_flt);
713 APL_Float imag = real_flt * sin(radian_flt);
714 tos.push_back(Token(TOK_COMPLEX, real, imag));
715 Log(LOG_tokenize) CERR << " tokenize_number: complex " << real
716 << "J" << imag << endl;
717 }
718 else
719 {
720 if (real_need_float)
721 {
722 tos.push_back(Token(TOK_REAL, real_flt));
723 Log(LOG_tokenize)
724 CERR << " tokenize_number: real " << real_flt << endl;
725 }
726 else
727 {
728 tos.push_back(Token(TOK_INTEGER, real_int));
729 Log(LOG_tokenize)
730 CERR << " tokenize_number: integer " << real_int << endl;
731 }
732 }
733
734 done:
735 // ISO 13751 requires a space between two numeric scalar literals (page 42),
736 // but not between a numeric scalar literal and an identifier.
737 //
738 // IBM APL2 requires a space in both cases. For example, 10Q10 is tokenized
739 // as 10Q 10
740 //
741 // We follow ISO. The second numeric literal cannot start with 0-9 because
742 // that would have been eaten by the first literal. Therefor the only cases
743 // remaining to be checked are a numeric scalar literal followed by ¯
744 // or by . (page 42)
745 //
746 if (src.rest())
747 {
748 if (*src == UNI_OVERBAR || *src == '.')
749 Error::throw_parse_error(E_BAD_NUMBER, LOC, loc);
750 }
751 }
752 //-----------------------------------------------------------------------------
753
// the maximum number of decimal mantissa digits considered when tokenizing
// a number (a 64-bit signed integer has at most 19 decimal digits)
//
enum { MAX_TOKENIZE_DIGITS_1 = 20,   // incl. rounding digit
       MAX_TOKENIZE_DIGITS = MAX_TOKENIZE_DIGITS_1 - 1   // excl. rounding digit
     };

// expand x into the 10 items x0, x1, ... x9 (e.g. 1E2 → 1E20, 1E21, ... 1E29)
//
#define exp_0_9(x) x ## 0, x ## 1, x ## 2, x ## 3, x ## 4, \
                   x ## 5, x ## 6, x ## 7, x ## 8, x ## 9,

// expo_tab[i] = 10^i for i = 0 ... 308 (the largest power of 10 that
// fits into a double)
//
static const long double expo_tab[309] =
{
  exp_0_9(1E)   exp_0_9(1E1)  exp_0_9(1E2)  exp_0_9(1E3)  exp_0_9(1E4)
  exp_0_9(1E5)  exp_0_9(1E6)  exp_0_9(1E7)  exp_0_9(1E8)  exp_0_9(1E9)
  exp_0_9(1E10) exp_0_9(1E11) exp_0_9(1E12) exp_0_9(1E13) exp_0_9(1E14)
  exp_0_9(1E15) exp_0_9(1E16) exp_0_9(1E17) exp_0_9(1E18) exp_0_9(1E19)
  exp_0_9(1E20) exp_0_9(1E21) exp_0_9(1E22) exp_0_9(1E23) exp_0_9(1E24)
  exp_0_9(1E25) exp_0_9(1E26) exp_0_9(1E27) exp_0_9(1E28) exp_0_9(1E29)
  1E300, 1E301, 1E302, 1E303, 1E304, 1E305, 1E306, 1E307, 1E308
};

// nexpo_tab[i] = 10^-i for i = 0 ... 309
//
// NOTE(review): tokenize_real() indexes this table with -expo, where expo
// (after being decreased by up to 17 mantissa digits) looks like it could
// drop below -309 for extreme inputs — verify the index range.
//
static const long double nexpo_tab[310] =
{
  exp_0_9(1E-)   exp_0_9(1E-1)  exp_0_9(1E-2)  exp_0_9(1E-3)  exp_0_9(1E-4)
  exp_0_9(1E-5)  exp_0_9(1E-6)  exp_0_9(1E-7)  exp_0_9(1E-8)  exp_0_9(1E-9)
  exp_0_9(1E-10) exp_0_9(1E-11) exp_0_9(1E-12) exp_0_9(1E-13) exp_0_9(1E-14)
  exp_0_9(1E-15) exp_0_9(1E-16) exp_0_9(1E-17) exp_0_9(1E-18) exp_0_9(1E-19)
  exp_0_9(1E-20) exp_0_9(1E-21) exp_0_9(1E-22) exp_0_9(1E-23) exp_0_9(1E-24)
  exp_0_9(1E-25) exp_0_9(1E-26) exp_0_9(1E-27) exp_0_9(1E-28) exp_0_9(1E-29)
  exp_0_9(1E-30)
};
782
783 bool
tokenize_real(Unicode_source & src,bool & need_float,APL_Float & flt_val,APL_Integer & int_val)784 Tokenizer::tokenize_real(Unicode_source & src, bool & need_float,
785 APL_Float & flt_val, APL_Integer & int_val)
786 {
787 int_val = 0;
788 need_float = false;
789
790 UTF8_string int_digits;
791 UTF8_string fract_digits;
792 UTF8_string expo_digits;
793 bool negative = false;
794 bool expo_negative = false;
795 bool skipped_0 = false;
796 bool dot_seen = false;
797
798 // hexadecimal ?
799 //
800 if (src.rest() > 1 && *src == UNI_ASCII_DOLLAR_SIGN)
801 {
802 src.get(); // skip $
803 if (!Avec::is_hex_digit(*src)) return false; // no hex after $
804 while (src.rest())
805 {
806 int digit;
807 if (*src < UNI_ASCII_0) break;
808 else if (*src <= UNI_ASCII_9) digit = src.get() - '0';
809 else if (*src < UNI_ASCII_A) break;
810 else if (*src <= UNI_ASCII_F) digit = 10 + src.get() - 'A';
811 else if (*src < UNI_ASCII_a) break;
812 else if (*src <= UNI_ASCII_f) digit = 10 + src.get() - 'a';
813 else break;
814 int_val = int_val << 4 | digit;
815 }
816 flt_val = int_val;
817 return true; // OK
818 }
819
820 // 1. split src into integer, fractional, and exponent parts, removing:
821 // 1a. a leading sign of the integer part
822 // 1b. leading zeros of the integer part
823 // 1c. the . between the integer and fractional parts
824 // 1d. trailing zeros of the fractional part
825 // 1e. the E between the fractional and exponent parts
826 // 1f. a sign of the exponent part
827 //
828 if (src.rest() && *src == UNI_OVERBAR) // 1a.
829 {
830 negative = true;
831 ++src;
832 }
833
834 // 1b. skip leading zeros in integer part
835 //
836 while (src.rest() && *src == '0') { ++src; skipped_0 = true; }
837
838 // integer part
839 //
840 while (src.rest() && Avec::is_digit(*src)) int_digits += src.get();
841
842 // fractional part
843 //
844 if (src.rest() && *src == UNI_ASCII_FULLSTOP) // fract part present
845 {
846 ++src; // 1c. skip '.'
847 dot_seen = true;
848 while (src.rest() && Avec::is_digit(*src))
849 {
850 fract_digits += src.get();
851 }
852
853 while (fract_digits.size() && fract_digits.back() == '0') // 1d.
854 fract_digits.pop_back();
855 }
856
857 // require at least one integer or fractional digit
858 //
859 if (int_digits.size() == 0 &&
860 fract_digits.size() == 0 &&
861 !skipped_0) return false; // error
862
863 // exponent part (but could also be a name starting with E or e)
864 //
865 if (src.rest() >= 2 && // at least E and a digit or ¯
866 (*src == UNI_ASCII_E || *src == UNI_ASCII_e)) // maybe exponent
867 {
868 expo_negative = (src[1] == UNI_OVERBAR);
869 if (expo_negative &&
870 src.rest() >= 3 &&
871 Avec::is_digit(src[2])) // E¯nnn
872 {
873 need_float = true;
874 ++src; // skip e/E
875 ++src; // skip ¯
876 while (src.rest() && Avec::is_digit(*src))
877 expo_digits += src.get();
878 }
879 else if (Avec::is_digit(src[1])) // Ennn
880 {
881 need_float = true;
882 ++src; // skip e/E
883 while (src.rest() && Avec::is_digit(*src))
884 expo_digits += src.get();
885 }
886 }
887
888 // second dot ?
889 if (dot_seen && src.rest() && *src == UNI_ASCII_FULLSTOP)
890 return false; // error
891
892 int expo = 0;
893 loop(d, expo_digits.size()) expo = 10*expo + (expo_digits[d] - '0');
894 if (expo_negative) expo = - expo;
895 expo += int_digits.size();
896
897 UTF8_string digits = int_digits;
898 digits.append_UTF8(fract_digits);
899
900 // at this point, digits is the fractional part ff... of 0.ff... ×10⋆expo
901 // discard leading fractional 0s and adjust expo accordingly
902 //
903 while (digits.size() && digits[0] == '0')
904 {
905 digits.erase(0); // discard '0'
906 --expo;
907 }
908
909 // at this point, digits is the fractional part ff... of 0.ff... ×10⋆expo
910 // from 0.1000... to 0.9999... Discard digits beyond integer precision.
911 //
912 // The largest 64-bit integer is 0x7FFFFFFFFFFFFFFF = 9223372036854775807
913 // which has 19 decimal digits. Discard all after first 20 digits and
914 // round according to the last digit.
915 //
916 if (digits.size() > MAX_TOKENIZE_DIGITS_1)
917 digits.resize(MAX_TOKENIZE_DIGITS_1);
918
919 if (digits.size() == MAX_TOKENIZE_DIGITS_1)
920 {
921 if (digits.round_0_1()) ++expo;
922 }
923
924 // special cases: all digits 0 or very small number
925 //
926 if (digits.size() == 0) return true; // OK: all digits were 0
927 if (expo <= -307) return true; // OK: very small number
928
929 Assert(digits.size() <= MAX_TOKENIZE_DIGITS);
930
931 if (expo > MAX_TOKENIZE_DIGITS) need_float = true; /// large integer
932
933 // special case: integer between 9223372036854775807 and 9999999999999999999
934 //
935 if (expo == MAX_TOKENIZE_DIGITS && digits[0] == '9')
936 {
937 // a uint64_t compare might overflow, so we compare the digit string
938 const char * maxint = "9223372036854775807";
939 loop(j, digits.size())
940 {
941 if (digits[j] < maxint[j]) break;
942 if (digits[j] == maxint[j]) continue;
943 need_float = true;
944 break;
945 }
946 }
947
948 if (int(digits.size()) > expo) need_float = true;
949
950 if (need_float)
951 {
952 if (digits.size() > 17) digits.resize(17);
953
954 int64_t v = 0;
955 loop(j, digits.size())
956 {
957 v = 10*v + (digits[j] - '0');
958 --expo;
959 }
960
961 if (expo > 0)
962 {
963 if (expo > 308) return false;
964 if (negative) flt_val = - v * expo_tab[expo];
965 else flt_val = v * expo_tab[expo];
966 return true; // OK
967 }
968 else if (expo < 0)
969 {
970 if (negative) flt_val = - v * nexpo_tab[-expo];
971 else flt_val = v * nexpo_tab[-expo];
972 return true; // OK
973 }
974
975 if (negative) flt_val = - v;
976 else flt_val = v;
977 return true; // OK
978 }
979 else
980 {
981 int_val = 0;
982 loop(j, digits.size()) int_val = 10*int_val + (digits[j] - '0');
983 if (negative) int_val = - int_val;
984 flt_val = int_val;
985 return true; // OK
986 }
987 }
988 //-----------------------------------------------------------------------------
989 void
tokenize_symbol(Unicode_source & src,Token_string & tos)990 Tokenizer::tokenize_symbol(Unicode_source & src, Token_string & tos)
991 {
992 Log(LOG_tokenize) CERR << "tokenize_symbol() : " << src.rest() << endl;
993
994 UCS_string symbol;
995 if (macro)
996 {
997 symbol.append(UNI_MUE);
998 symbol.append(UNI_ASCII_MINUS);
999 }
1000 symbol.append(src.get());
1001
1002 while (src.rest())
1003 {
1004 const Unicode uni = *src;
1005 if (!Avec::is_symbol_char(uni)) break;
1006 symbol.append(uni);
1007 ++src;
1008 }
1009
1010 if (symbol.size() > 2 && symbol[1] == UNI_DELTA &&
1011 (symbol[0] == UNI_ASCII_S || symbol[0] == UNI_ASCII_T))
1012 {
1013 // S∆ or T∆
1014
1015 while (src.rest() && *src <= UNI_ASCII_SPACE) src.get(); // spaces
1016 UCS_string symbol1(symbol, 2, symbol.size() - 2); // without S∆/T∆
1017 Value_P AB(symbol1, LOC);
1018 Function * ST = 0;
1019 if (symbol[0] == UNI_ASCII_S) ST = Quad_STOP::fun;
1020 else ST = Quad_TRACE::fun;
1021
1022 const bool assigned = (src.rest() && *src == UNI_LEFT_ARROW);
1023 if (assigned) // dyadic: AB ∆fun
1024 {
1025 src.get(); // skip ←
1026 Log(LOG_tokenize)
1027 CERR << "Stop/Trace assigned: " << symbol1 << endl;
1028 tos.push_back(Token(TOK_APL_VALUE1, AB)); // left argument of ST
1029 tos.push_back(Token(TOK_FUN2, ST));
1030 }
1031 else
1032 {
1033 Log(LOG_tokenize)
1034 CERR << "Stop/Trace referenved: " << symbol1 << endl;
1035 tos.push_back(Token(TOK_FUN2, ST));
1036 tos.push_back(Token(TOK_APL_VALUE1, AB)); // right argument of ST
1037 }
1038
1039 return;
1040 }
1041
1042 Symbol * sym = Workspace::lookup_symbol(symbol);
1043 Assert(sym);
1044 tos.push_back(Token(TOK_SYMBOL, sym));
1045 }
1046 //-----------------------------------------------------------------------------
1047
1048