/*
    This file is part of GNU APL, a free implementation of the
    ISO/IEC Standard 13751, "Programming Language APL, Extended"

    Copyright (C) 2008-2015  Dr. Jürgen Sauermann

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
20 
21 #ifndef __AVEC_HH_DEFINED__
22 #define __AVEC_HH_DEFINED__
23 
24 #include "Common.hh"
25 
26 class Token;
27 
28 
29 /// The valid indices the Atomic Vector of the APL interpreter
30 enum CHT_Index
31 {
32    Invalid_CHT = -1,
33 #define char_def( n, _u, _t, _f, _p) AV_ ## n,
34 #define char_df1(_n, _u, _t, _f, _p)
35 #include "Avec.def"
36    MAX_AV,
37 };
38 
/// Some flags helping to classify characters. The single-bit values can
/// be or-ed together; the combined values below are such unions.
enum CharacterFlag
{
   FLG_NONE            = 0x0000,   ///< no flag
   FLG_SYMBOL          = 0x0001,   ///< valid char in a user defined name
   FLG_DIGIT           = 0x0002,   ///< 0-9
   FLG_NO_SPACE_AFTER  = 0x0004,   ///< never need a space after this char
   FLG_NO_SPACE_BEFORE = 0x0008,   ///< never need a space before this char

   /// never need a space on either side of this char
   FLG_NO_SPACE        = FLG_NO_SPACE_AFTER | FLG_NO_SPACE_BEFORE,

   /// 0-9, A-Z, a-z, ∆, ⍙
   FLG_NUMERIC         = FLG_DIGIT | FLG_SYMBOL,
};
51 
52 /**
53     class Avec is a collection of static functions related to the Atomic
54     Vector of the APL interpreter
55  */
56 /// Static helper  functions related to ⎕AV
57 class Avec
58 {
59 public:
60    /// init the static tables of this class and check them
61    static void init();
62 
63    /// Return the UNICODE of char table entry \b av
64    static Unicode unicode(CHT_Index av);
65 
66    /// Return the UNICODE of char table entry \b av
67    static uint32_t get_av_pos(CHT_Index av);
68 
69    /// Return a token containing \b av
70    static Token uni_to_token(Unicode & av, const char * loc);
71 
72    /// Return \b true iff \b av is a valid char in a user defined symbol
73    static bool is_symbol_char(Unicode av);
74 
75    /// return \b true iff \b av is a whitespace char (ASCII 0..32 (including))
is_white(Unicode av)76    static bool is_white(Unicode av)
77       { return av >= 0 && av <= ' '; }
78 
79    /// return \b true iff \b av is one of the single quote characters ' ‘ or ’
is_single_quote(Unicode av)80    static bool is_single_quote(Unicode av)
81       { return av == UNI_SINGLE_QUOTE  ||
82                av == UNI_SINGLE_QUOTE1 ||
83                av == UNI_SINGLE_QUOTE2; }
84 
85    /// return \b true iff \b av is one of the various diamond characters
is_diamond(Unicode av)86    static bool is_diamond(Unicode av)
87       { return av == UNI_DIAMOND || av == 0x22C4 || av == 0x2662 ||
88                av == 0x2B25      || av == 0x2B26      || av == 0x2B27; }
89 
90    /// return \b true iff \b av is a control char (ASCII 0..32 (excluding))
is_control(Unicode av)91    static bool is_control(Unicode av)
92       { return av >= 0 && av < ' '; }
93 
94    /// Return \b true iff \b av is a valid char in a user defined symbol
95    static bool is_first_symbol_char(Unicode uni);
96 
97    /// Return \b true iff \b av is a digit (i.e. 0 ≤ av ≤ 9)
is_digit(Unicode uni)98    static bool is_digit(Unicode uni)
99       { return (uni <= UNI_ASCII_9 && uni >= UNI_ASCII_0); }
100 
101    /// Return \b true iff \b av is a digit (i.e. 0 ≤ av ≤ 9)
is_hex_digit(Unicode uni)102    static bool is_hex_digit(Unicode uni)
103       { return is_digit(uni)
104             || (uni <= UNI_ASCII_F && uni >= UNI_ASCII_A)
105             || (uni <= UNI_ASCII_f && uni >= UNI_ASCII_a); }
106 
107    /// Return \b true iff \b av is a digit or a space
is_digit_or_space(Unicode uni)108    static bool is_digit_or_space(Unicode uni)
109       { return is_digit(uni) || is_white(uni); }
110 
111    /// return \b true iff \b av is a number char (digit, .)
is_number(Unicode uni)112    static bool is_number(Unicode uni)
113       { return is_digit(uni) || (uni == UNI_OVERBAR); }
114 
115    /// return true if unicode \b is defined by a char_def() or char_df1() macro
116    static bool is_known_char(Unicode uni);
117 
118    /// return true if unicode \b uni is a quad (⎕ or ▯)
is_quad(Unicode uni)119    static bool is_quad(Unicode uni)
120       { return uni == UNI_Quad_Quad || uni == UNI_Quad_Quad1; }
121 
122    /// return true if unicode \b uni needs ⎕UCS in 2 ⎕TF or )OUT
123    static bool need_UCS(Unicode uni);
124 
125    /// return \b true iff a token printed after \b av never needs a space
126    static bool no_space_after(Unicode uni);
127 
128    /// return \b true iff a token printed before \b av never needs a space
129    static bool no_space_before(Unicode av);
130 
131    /// return the subscript char for digit i
132    static Unicode subscript(uint32_t i);
133 
134    /// return the superscript char for digit i
135    static Unicode superscript(uint32_t i);
136 
137    /// Find \b av in \b character_table. Return AV position if found or MAX_AV
138    static uint32_t find_av_pos(Unicode av);
139 
140    /// Find \b av in \b character_table. Return position or Invalid_CHT
141    static CHT_Index find_char(Unicode av);
142 
143    /// Find \b return position of \b alt_av, or Invalid_AV if not found
144    static CHT_Index map_alternative_char(Unicode alt_av);
145 
146    /// a pointer to 256 Unicode characters that are exactly the APL2 character
147    /// set (⎕AV) shown in lrm Appendix A page 470. The ⎕AV of GNU APL is
148    /// similar, but contains characters like ≢ that are not in IBM's ⎕AV
149    /// IBM's ⎕AV is used in the )IN command
150    static const Unicode * IBM_quad_AV();
151 
152    /// search uni in \b inverse_ibm_av and return its position in
153    /// the IBM ⎕AV (= its code in .ATF files). Return 0xB0 if not found.
154    static unsigned char unicode_to_cp(Unicode uni);
155 
156    /// recompute \b inverse_ibm_av from \b ibm_av and print it
157    static void print_inverse_IBM_quad_AV();
158 
159 protected:
160    /// a Unicode and its position in the ⎕AV of IBM APL2
161    struct Unicode_to_IBM_codepoint
162       {
163          uint32_t uni;   ///< the Unicode
164          uint32_t cp;    ///< the IBM char for uni
165       };
166 
167    /// Unicode_to_IBM_codepoint table sorted by Unicode (for bsearch())
168    static Unicode_to_IBM_codepoint inverse_ibm_av[];
169 
170    /// print an error position on cerr, and then Assert(0);
171    static void show_error_pos(int i, int line, bool cond, int def_line);
172 
173    /// check that the character table that is used in this class is correct
174    static void check_av_table();
175 
176    /// check that all characters in the UTF-8 encoded file are known
177    /// (through char_def() or char_df1() macros)
178    static void check_file(const char * filename);
179 
180    /// compare the unicodes of two entries ua and u2 in \b inverse_IBM_quad_AV
181    static int compare_uni(const void * u1, const void * u2);
182 };
183 
184 #endif // __AVEC_HH_DEFINED__
185