1 /*
2     This file is part of GNU APL, a free implementation of the
3     ISO/IEC Standard 13751, "Programming Language APL, Extended"
4 
5     Copyright (C) 2008-2016  Dr. Jürgen Sauermann
6 
7     This program is free software: you can redistribute it and/or modify
8     it under the terms of the GNU General Public License as published by
9     the Free Software Foundation, either version 3 of the License, or
10     (at your option) any later version.
11 
12     This program is distributed in the hope that it will be useful,
13     but WITHOUT ANY WARRANTY; without even the implied warranty of
14     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15     GNU General Public License for more details.
16 
17     You should have received a copy of the GNU General Public License
18     along with this program.  If not, see <http://www.gnu.org/licenses/>.
19 */
20 
21 #ifndef __UCS_STRING_HH_DEFINED__
22 #define __UCS_STRING_HH_DEFINED__
23 
24 #include <stdint.h>
25 #include <stdio.h>
26 #include <string>
27 
28 #include "Assert.hh"
29 #include "Common.hh"
30 #include "Heapsort.hh"
31 #include "Unicode.hh"
32 #include "UTF8_string.hh"
33 
34 using namespace std;
35 
36 class PrintBuffer;
37 class PrintContext;
38 class Shape;
39 class Value;
40 class UCS_string_vector;
41 
42 /// track construction and destruction of UCS_strings
43 #define UCS_tracking 0
44 
45 //=============================================================================
46 /// A string of Unicode characters (32-bit)
47 class UCS_string : public  basic_string<Unicode>
48 {
49 public:
50    /// default constructor: empty string
51    UCS_string();
52 
53    /// constructor: one-element string
54    UCS_string(Unicode uni);
55 
56    /// constructor: \b len Unicode characters, starting at \b data
57    UCS_string(const Unicode * data, size_t len);
58 
59    /// constructor: \b len times \b uni
60    UCS_string(size_t len, Unicode uni);
61 
62    /// constructor: copy of another UCS_string
63    UCS_string(const UCS_string & ucs);
64 
65    /// constructor: copy of another UCS_string
66    UCS_string(const UCS_string & ucs, size_t pos, size_t len);
67 
68    /// constructor: UCS_string from UTF8_string
69    UCS_string(const UTF8_string & utf);
70 
71    /// constructor: UCS_string from 0-terminated C string
72    UCS_string(const char * cstring);
73 
74    /// constructor: UCS_string from print buffer
75    UCS_string(const PrintBuffer & pb, Rank rank, int quad_PW);
76 
77    /// constructor: UCS_string from a double with quad_pp valid digits.
78    /// (eg. 3.33 has 3 digits), In standard APL format.
79    UCS_string(APL_Float value, bool & scaled, const PrintContext & pctx);
80 
81    /// constructor: read one line from UTF8-encoded file.
82    UCS_string(istream & in);
83 
84    /// constructor: UCS_string from simple character vector value.
85    UCS_string(const Value & value);
86 
87 #if UCS_tracking
88    /// common part of all constructors
89    void create(const char * loc);
90 
91    /// destructor
92    ~UCS_string();
93 #else
94    /// common part of all constructors
create(const char * loc)95    void create(const char * loc)   { ++total_count; }
96 
97    /// destructor
~UCS_string()98    ~UCS_string()                   { --total_count; }
99 #endif
100 
101    /// cast to an array of items with the same size as Unicode. This is for
102    /// interfacing to libraries that have typedef'ed Unicodes differently.
103    template<typename T>
raw() const104    const T * raw() const
105       {
106         Assert(sizeof(T) == sizeof(Unicode));
107         return reinterpret_cast<const T *>(&at(0));
108       }
109 
110    /// compute the length of an output row
111    int compute_chunk_length(int quad_PW, int col) const;
112 
113    /// remove trailing pad characters
114    void remove_trailing_padchars();
115 
116    /// remove trailing blanks, tabs, etc
117    void remove_trailing_whitespaces();
118 
119    /// remove leading blanks, tabs, etc
120    void remove_leading_whitespaces();
121 
122    /// remove leading and trailing whitespaces
remove_leading_and_trailing_whitespaces()123    void remove_leading_and_trailing_whitespaces()
124       {
125         remove_trailing_whitespaces();
126         remove_leading_whitespaces();
127       }
128 
129    /// skip leading whitespaces starting at idx, append the following
130    /// non-whitespaces (if any) to \b dest, and skip trailing whitespaces
131    void copy_black(UCS_string & dest, int & idx) const;
132 
133    /// \b this is a command with optional args. Remove leading and trailing
134    /// whitespaces, append args to rest, and remove args from this.
135    void split_ws(UCS_string & rest);
136 
137    /// return the number of LF characters in \b this string
138    ShapeItem LF_count() const;
139 
140    /// return the start position of \b sub in \b this string or -1 if \b sub
141    /// is not contained in \b this string
142    ShapeItem substr_pos(const UCS_string & sub) const;
143 
144    /// return this string with the first \b drop_count characters removed
drop(int drop_count) const145    UCS_string drop(int drop_count) const
146       {
147         if (drop_count <= 0)        return UCS_string(*this, 0, size());
148         if (size() <= drop_count)   return UCS_string();
149         return UCS_string(*this, drop_count, size() - drop_count);
150       }
151 
152    /// return the last character in \b this string
back() const153    Unicode back() const
154     { return size() ? (*this)[size() - 1] : Invalid_Unicode; }
155 
156    /// return the last character in \b this string
back()157    Unicode & back()
158     { Assert(size());   return at(size() - 1); }
159 
160    /// return true if this string contains non-whitespace characters
161    bool has_black() const;
162 
163    /// return true if \b this starts with prefix (ASCII, case matters).
164    bool starts_with(const char * prefix) const;
165 
166    /// return true if \b this ends with suffix (ASCII, case matters).
167    bool ends_with(const char * suffix) const;
168 
169    /// return true if \b this starts with \b prefix (case sensitive).
170    bool starts_with(const UCS_string & prefix) const;
171 
172    /// return true if \b this starts with \b prefix (ASCII, case insensitive).
173    bool starts_iwith(const char * prefix) const;
174 
175    /// return true if \b this starts with \b prefix (case insensitive).
176    bool starts_iwith(const UCS_string & prefix) const;
177 
178    /// return a string like this, but with pad chars mapped to spaces
179    UCS_string no_pad() const;
180 
181    /// replace pad chars in \b this string by spaces
182    void map_pad();
183 
184    /// return a string like this, but with pad chars removed
185    UCS_string remove_pad() const;
186 
187    /// remove the last character in \b this string
pop_back()188    void pop_back()
189    { Assert(size() > 0);   resize(size() - 1); }
190 
191    /// return this string reversed (i.e. characters from back to front).
192    UCS_string reverse() const;
193 
194    /// return true if \b this string starts with # or ⍝ or x:
195    bool is_comment_or_label() const;
196 
197    /// return true if every character in \b this string is the digit '0'
all_zeroes() const198    bool all_zeroes() const
199       { loop(s, size())   if ((*this)[s] != UNI_ASCII_0)   return false;
200         return true;
201       }
202 
203    /// return the number of unescaped and un-commented " in this string
204    ShapeItem double_quote_count(bool in_quote2) const;
205 
206    /// return the position of the first (leftmost) unescaped " in \b this
207    /// string (if any), or else -1
208    ShapeItem double_quote_first() const;
209 
210    /// return the position of the last (rightmost) unescaped " in \b this
211    /// string (if any), or else -1
212    ShapeItem double_quote_last() const;
213 
214    /// return integer value for a string starting with optional whitespaces,
215    /// followed by digits.
216    int atoi() const;
217 
218    /// append UCS_string other to this string
append(const UCS_string & other)219    void append(const UCS_string & other)
220       { basic_string::append(other); }
221 
222    /// append 0-terminated ASCII string \b str to this string. str is NOT
223    /// interpreted as UTF8 string (use append_UTF8() if such interpretation        /// is desired)
append_ASCII(const char * ascii)224    void append_ASCII(const char * ascii)
225       { while (*ascii)   *this += Unicode(*ascii++); }
226 
227 
228    /// append 0-terminated UTF8 string str to \b this UCS_string.
229    // This is different from append_ASCII((const char * str):
230    ///
231    /// append_ascii() appends one Unicode per byte (i.e. strlen(str) in total),
232    /// without checking for UTF8 sequences.
233    ///
234    /// append_UTF8() appends one Unicode per UTF8 sequence (which is the same
235    /// if all characteras are ASCII, but less if not.
236    void append_UTF8(const UTF8 * str);
237 
238    /// same as app(const UTF8 * str)
append_UTF8(const char * str)239    void append_UTF8(const char * str)
240       { append_UTF8(utf8P(str)); }
241 
242    /// more intuitive insert() function
insert(ShapeItem pos,Unicode uni)243    void insert(ShapeItem pos, Unicode uni)
244       { basic_string::insert(pos, 1, uni); }
245 
246    /// prepend character \b uni
prepend(Unicode uni)247    void prepend(Unicode uni)
248       { insert(0, uni); }
249 
250    /// return \b this string and \b other concatenated
operator +(const UCS_string & other) const251    UCS_string operator +(const UCS_string & other) const
252       { UCS_string ret(*this);   ret += other;   return ret; }
253 
254    /// append C-string \b str
operator <<(const char * str)255    UCS_string & operator <<(const char * str)
256       { append_UTF8(str);   return *this; }
257 
258    /// append number \b num
operator <<(ShapeItem num)259    UCS_string & operator <<(ShapeItem num)
260       { append_number(num);   return *this; }
261 
262    /// append character \b uni
operator <<(Unicode uni)263    UCS_string & operator <<(Unicode uni)
264       { *this += uni;   return *this; }
265 
266    /// append UCS_string \b other
operator <<(const UCS_string & other)267    UCS_string & operator <<(const UCS_string & other)
268       { basic_string::append(other);   return *this; }
269 
270    /// compare \b this with UCS_string \b other
compare(const UCS_string & other) const271    Comp_result compare(const UCS_string & other) const
272       {
273         if (*this < other)   return COMP_LT;
274         if (*this > other)   return COMP_GT;
275         return COMP_EQ;
276       }
277 
278    /// append \b other in quotes, doubling quoted in \b other
279    void append_quoted(const UCS_string & other);
280 
281    /// append number (in ASCII encoding like %d) to this string
282    void append_number(ShapeItem num);
283 
284    /// append number (in ASCII encoding like %X or %x) to this string
285    void append_hex(ShapeItem num, bool uppercase);
286 
287    /// append shape (in APL encoding tke left arg of ↑) this string
288    void append_shape(const Shape & shape);
289 
290    /// append number (in ASCII encoding like %lf) to this string
291    void append_float(APL_Float num);
292 
293    /// split \b this multi-line string into individual lines,
294    /// removing the CR and NL chars in \b this string.
295    size_t to_vector(UCS_string_vector & result) const;
296 
297    /// return \b this string with "escape sequences" replaced by their real
298    /// characters ('' → ' if single quoted and \\r, \\n, \\xNNN etc. otherwise.
299    UCS_string un_escape(bool double_quoted, bool keep_LF) const;
300 
301    /// the inverse of \b un_escape().
302    UCS_string do_escape(bool double_quoted) const;
303 
304    /// overload basic_string::size() so that it returns a signed length
size() const305    ShapeItem size() const
306       { return basic_string::size(); }
307 
308    /// an iterator for UCS_strings
309    class iterator
310       {
311         public:
312            /// constructor: start at position p
iterator(const UCS_string & ucs,int p)313            iterator(const UCS_string & ucs, int p)
314            : s(ucs),
315              pos(p)
316            {}
317 
318            /// return char at offset off from current position
get(int off=0) const319            Unicode get(int off = 0) const
320               { return (pos + off) < s.size() ? s[pos+off] : Invalid_Unicode; }
321 
322            /// return next char
next()323            Unicode next()
324               { return pos < s.size() ? s[pos++] : Invalid_Unicode; }
325 
326            /// return true iff there are more chars in the string
more() const327            bool more() const
328               { return pos < s.size(); }
329 
330         protected:
331            /// the string
332            const UCS_string & s;
333 
334            /// the current position in the string
335            int pos;
336       };
337 
338    /// an iterator set to the start of this string
begin() const339    UCS_string::iterator begin() const
340       { return iterator(*this, 0); }
341 
342    /// round last digit and discard it.
343    void round_last_digit();
344 
345    /// return true if \b this string contains \b uni
346    bool contains(Unicode uni);
347 
348    /// case-sensitive comparison: return true iff \b this comes before \b other
349    bool lexical_before(const UCS_string other) const;
350 
351    /// dump \b this string to out (like U+nnn U+mmm ... )
352    ostream & dump(ostream & out) const;
353 
354    /// sort the characters in this string by their Unicode
355    UCS_string sort() const;
356 
357    /// return the characters in this string (sorted and duplicates removed)
358    UCS_string unique() const;
359 
360    /// return this string HTML-escaped, starting at offset, maybe using &nbsp;
361    UCS_string to_HTML(int offset, bool preserve_ws) const;
362 
363    /// erase 1 (!) character at pos
erase(ShapeItem pos)364    void erase(ShapeItem pos)
365       { basic_string::erase(pos, 1); }
366 
367    /// helper function for Heapsort<Unicode>::sort()
greater_uni(const Unicode & u1,const Unicode & u2,const void *)368    static bool greater_uni(const Unicode & u1, const Unicode & u2, const void *)
369       { return u1 > u2; }
370 
371    /// convert a signed integer value to an UCS_string (like sprintf())
372    static UCS_string from_int(int64_t value);
373 
374    /// convert an unsigned integer value to an UCS_string (like sprintf())
375    static UCS_string from_uint(uint64_t value);
376 
377    /// convert the integer part of value to an UCS_string and remove it
378    /// from value
379    static UCS_string from_big(APL_Float & value);
380 
381    /// convert double \b value to an UCS_string with \b fract_digits fractional
382    /// digits in scaled (exponential) format
383    static UCS_string from_double_expo_prec(APL_Float value, int fract_digits);
384 
385    /// convert double \b value to an UCS_string with \b fract_digits fractional
386    /// digits in fixed point format
387    static UCS_string from_double_fixed_prec(APL_Float value, int fract_digits);
388 
389    /// convert double \b value to an UCS_string with \b quad_pp significant
390    /// digits in scaled (exponential) format
391    static UCS_string from_double_expo_pp(APL_Float value, int quad_pp);
392 
393    /// convert double \b value to an UCS_string with \b quad_pp significant
394    /// digits in fixed point format
395    static UCS_string from_double_fixed_pp(APL_Float value, int quad_pp);
396 
397    /// return the total number of UCS_strings
get_total_count()398    static ShapeItem get_total_count()
399       { return total_count; }
400 
401    /// return true if n1 < n2
compare_names(const UCS_string & n1,const UCS_string & n2,const void *)402    static bool compare_names(const UCS_string & n1,
403                              const UCS_string & n2, const void *)
404       { return n2.compare(n1) == COMP_LT; }
405 
406 protected:
407    /// the total number of UCS_strings
408    static ShapeItem total_count;
409 
410    /// the next unique instance_id
411    static ShapeItem total_id;
412 
413 #if UCS_tracking
414    /// a unique number for \b this  UCS_string
415    ShapeItem instance_id;
416 #endif
417 
418 private:
419    /// prevent accidental allocation
420    void * operator new[](size_t size);
421 
422    /// prevent accidental de-allocation
423    void operator delete[](void *);
424 
425 private:
426    /// prevent accidental usage of the rather dangerous default len parameter
427    /// in basic_strng::erase(pos, len = npos)
428    basic_string & erase(size_type pos, size_type len);
429 };
430 //-----------------------------------------------------------------------------
431 inline void
Hswap(const UCS_string * & u1,const UCS_string * & u2)432 Hswap(const UCS_string * & u1, const UCS_string * & u2)
433 {
434 const UCS_string * tmp = u1;   u1 = u2;   u2 = tmp;
435 }
436 //-----------------------------------------------------------------------------
437 inline void
Hswap(UCS_string & u1,UCS_string & u2)438 Hswap(UCS_string & u1, UCS_string & u2)
439 {
440 UCS_string  u = u1;
441            u1 = u2;
442            u2 = u;
443 }
444 //-----------------------------------------------------------------------------
445 
446 #endif // __UCS_STRING_HH_DEFINED__
447 
448