1 /*
2 This file is part of GNU APL, a free implementation of the
3 ISO/IEC Standard 13751, "Programming Language APL, Extended"
4
5 Copyright (C) 2008-2016 Dr. Jürgen Sauermann
6
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #ifndef __UCS_STRING_HH_DEFINED__
22 #define __UCS_STRING_HH_DEFINED__
23
24 #include <stdint.h>
25 #include <stdio.h>
26 #include <string>
27
28 #include "Assert.hh"
29 #include "Common.hh"
30 #include "Heapsort.hh"
31 #include "Unicode.hh"
32 #include "UTF8_string.hh"
33
34 using namespace std;
35
36 class PrintBuffer;
37 class PrintContext;
38 class Shape;
39 class Value;
40 class UCS_string_vector;
41
42 /// track construction and destruction of UCS_strings
43 #define UCS_tracking 0
44
45 //=============================================================================
46 /// A string of Unicode characters (32-bit)
47 class UCS_string : public basic_string<Unicode>
48 {
49 public:
50 /// default constructor: empty string
51 UCS_string();
52
53 /// constructor: one-element string
54 UCS_string(Unicode uni);
55
56 /// constructor: \b len Unicode characters, starting at \b data
57 UCS_string(const Unicode * data, size_t len);
58
59 /// constructor: \b len times \b uni
60 UCS_string(size_t len, Unicode uni);
61
62 /// constructor: copy of another UCS_string
63 UCS_string(const UCS_string & ucs);
64
65 /// constructor: copy of another UCS_string
66 UCS_string(const UCS_string & ucs, size_t pos, size_t len);
67
68 /// constructor: UCS_string from UTF8_string
69 UCS_string(const UTF8_string & utf);
70
71 /// constructor: UCS_string from 0-terminated C string
72 UCS_string(const char * cstring);
73
74 /// constructor: UCS_string from print buffer
75 UCS_string(const PrintBuffer & pb, Rank rank, int quad_PW);
76
77 /// constructor: UCS_string from a double with quad_pp valid digits.
78 /// (eg. 3.33 has 3 digits), In standard APL format.
79 UCS_string(APL_Float value, bool & scaled, const PrintContext & pctx);
80
81 /// constructor: read one line from UTF8-encoded file.
82 UCS_string(istream & in);
83
84 /// constructor: UCS_string from simple character vector value.
85 UCS_string(const Value & value);
86
87 #if UCS_tracking
88 /// common part of all constructors
89 void create(const char * loc);
90
91 /// destructor
92 ~UCS_string();
93 #else
94 /// common part of all constructors
create(const char * loc)95 void create(const char * loc) { ++total_count; }
96
97 /// destructor
~UCS_string()98 ~UCS_string() { --total_count; }
99 #endif
100
101 /// cast to an array of items with the same size as Unicode. This is for
102 /// interfacing to libraries that have typedef'ed Unicodes differently.
103 template<typename T>
raw() const104 const T * raw() const
105 {
106 Assert(sizeof(T) == sizeof(Unicode));
107 return reinterpret_cast<const T *>(&at(0));
108 }
109
110 /// compute the length of an output row
111 int compute_chunk_length(int quad_PW, int col) const;
112
113 /// remove trailing pad characters
114 void remove_trailing_padchars();
115
116 /// remove trailing blanks, tabs, etc
117 void remove_trailing_whitespaces();
118
119 /// remove leading blanks, tabs, etc
120 void remove_leading_whitespaces();
121
122 /// remove leading and trailing whitespaces
remove_leading_and_trailing_whitespaces()123 void remove_leading_and_trailing_whitespaces()
124 {
125 remove_trailing_whitespaces();
126 remove_leading_whitespaces();
127 }
128
129 /// skip leading whitespaces starting at idx, append the following
130 /// non-whitespaces (if any) to \b dest, and skip trailing whitespaces
131 void copy_black(UCS_string & dest, int & idx) const;
132
133 /// \b this is a command with optional args. Remove leading and trailing
134 /// whitespaces, append args to rest, and remove args from this.
135 void split_ws(UCS_string & rest);
136
137 /// return the number of LF characters in \b this string
138 ShapeItem LF_count() const;
139
140 /// return the start position of \b sub in \b this string or -1 if \b sub
141 /// is not contained in \b this string
142 ShapeItem substr_pos(const UCS_string & sub) const;
143
144 /// return this string with the first \b drop_count characters removed
drop(int drop_count) const145 UCS_string drop(int drop_count) const
146 {
147 if (drop_count <= 0) return UCS_string(*this, 0, size());
148 if (size() <= drop_count) return UCS_string();
149 return UCS_string(*this, drop_count, size() - drop_count);
150 }
151
152 /// return the last character in \b this string
back() const153 Unicode back() const
154 { return size() ? (*this)[size() - 1] : Invalid_Unicode; }
155
156 /// return the last character in \b this string
back()157 Unicode & back()
158 { Assert(size()); return at(size() - 1); }
159
160 /// return true if this string contains non-whitespace characters
161 bool has_black() const;
162
163 /// return true if \b this starts with prefix (ASCII, case matters).
164 bool starts_with(const char * prefix) const;
165
166 /// return true if \b this ends with suffix (ASCII, case matters).
167 bool ends_with(const char * suffix) const;
168
169 /// return true if \b this starts with \b prefix (case sensitive).
170 bool starts_with(const UCS_string & prefix) const;
171
172 /// return true if \b this starts with \b prefix (ASCII, case insensitive).
173 bool starts_iwith(const char * prefix) const;
174
175 /// return true if \b this starts with \b prefix (case insensitive).
176 bool starts_iwith(const UCS_string & prefix) const;
177
178 /// return a string like this, but with pad chars mapped to spaces
179 UCS_string no_pad() const;
180
181 /// replace pad chars in \b this string by spaces
182 void map_pad();
183
184 /// return a string like this, but with pad chars removed
185 UCS_string remove_pad() const;
186
187 /// remove the last character in \b this string
pop_back()188 void pop_back()
189 { Assert(size() > 0); resize(size() - 1); }
190
191 /// return this string reversed (i.e. characters from back to front).
192 UCS_string reverse() const;
193
194 /// return true if \b this string starts with # or ⍝ or x:
195 bool is_comment_or_label() const;
196
197 /// return true if every character in \b this string is the digit '0'
all_zeroes() const198 bool all_zeroes() const
199 { loop(s, size()) if ((*this)[s] != UNI_ASCII_0) return false;
200 return true;
201 }
202
203 /// return the number of unescaped and un-commented " in this string
204 ShapeItem double_quote_count(bool in_quote2) const;
205
206 /// return the position of the first (leftmost) unescaped " in \b this
207 /// string (if any), or else -1
208 ShapeItem double_quote_first() const;
209
210 /// return the position of the last (rightmost) unescaped " in \b this
211 /// string (if any), or else -1
212 ShapeItem double_quote_last() const;
213
214 /// return integer value for a string starting with optional whitespaces,
215 /// followed by digits.
216 int atoi() const;
217
218 /// append UCS_string other to this string
append(const UCS_string & other)219 void append(const UCS_string & other)
220 { basic_string::append(other); }
221
222 /// append 0-terminated ASCII string \b str to this string. str is NOT
223 /// interpreted as UTF8 string (use append_UTF8() if such interpretation /// is desired)
append_ASCII(const char * ascii)224 void append_ASCII(const char * ascii)
225 { while (*ascii) *this += Unicode(*ascii++); }
226
227
228 /// append 0-terminated UTF8 string str to \b this UCS_string.
229 // This is different from append_ASCII((const char * str):
230 ///
231 /// append_ascii() appends one Unicode per byte (i.e. strlen(str) in total),
232 /// without checking for UTF8 sequences.
233 ///
234 /// append_UTF8() appends one Unicode per UTF8 sequence (which is the same
235 /// if all characteras are ASCII, but less if not.
236 void append_UTF8(const UTF8 * str);
237
238 /// same as app(const UTF8 * str)
append_UTF8(const char * str)239 void append_UTF8(const char * str)
240 { append_UTF8(utf8P(str)); }
241
242 /// more intuitive insert() function
insert(ShapeItem pos,Unicode uni)243 void insert(ShapeItem pos, Unicode uni)
244 { basic_string::insert(pos, 1, uni); }
245
246 /// prepend character \b uni
prepend(Unicode uni)247 void prepend(Unicode uni)
248 { insert(0, uni); }
249
250 /// return \b this string and \b other concatenated
operator +(const UCS_string & other) const251 UCS_string operator +(const UCS_string & other) const
252 { UCS_string ret(*this); ret += other; return ret; }
253
254 /// append C-string \b str
operator <<(const char * str)255 UCS_string & operator <<(const char * str)
256 { append_UTF8(str); return *this; }
257
258 /// append number \b num
operator <<(ShapeItem num)259 UCS_string & operator <<(ShapeItem num)
260 { append_number(num); return *this; }
261
262 /// append character \b uni
operator <<(Unicode uni)263 UCS_string & operator <<(Unicode uni)
264 { *this += uni; return *this; }
265
266 /// append UCS_string \b other
operator <<(const UCS_string & other)267 UCS_string & operator <<(const UCS_string & other)
268 { basic_string::append(other); return *this; }
269
270 /// compare \b this with UCS_string \b other
compare(const UCS_string & other) const271 Comp_result compare(const UCS_string & other) const
272 {
273 if (*this < other) return COMP_LT;
274 if (*this > other) return COMP_GT;
275 return COMP_EQ;
276 }
277
278 /// append \b other in quotes, doubling quoted in \b other
279 void append_quoted(const UCS_string & other);
280
281 /// append number (in ASCII encoding like %d) to this string
282 void append_number(ShapeItem num);
283
284 /// append number (in ASCII encoding like %X or %x) to this string
285 void append_hex(ShapeItem num, bool uppercase);
286
287 /// append shape (in APL encoding tke left arg of ↑) this string
288 void append_shape(const Shape & shape);
289
290 /// append number (in ASCII encoding like %lf) to this string
291 void append_float(APL_Float num);
292
293 /// split \b this multi-line string into individual lines,
294 /// removing the CR and NL chars in \b this string.
295 size_t to_vector(UCS_string_vector & result) const;
296
297 /// return \b this string with "escape sequences" replaced by their real
298 /// characters ('' → ' if single quoted and \\r, \\n, \\xNNN etc. otherwise.
299 UCS_string un_escape(bool double_quoted, bool keep_LF) const;
300
301 /// the inverse of \b un_escape().
302 UCS_string do_escape(bool double_quoted) const;
303
304 /// overload basic_string::size() so that it returns a signed length
size() const305 ShapeItem size() const
306 { return basic_string::size(); }
307
308 /// an iterator for UCS_strings
309 class iterator
310 {
311 public:
312 /// constructor: start at position p
iterator(const UCS_string & ucs,int p)313 iterator(const UCS_string & ucs, int p)
314 : s(ucs),
315 pos(p)
316 {}
317
318 /// return char at offset off from current position
get(int off=0) const319 Unicode get(int off = 0) const
320 { return (pos + off) < s.size() ? s[pos+off] : Invalid_Unicode; }
321
322 /// return next char
next()323 Unicode next()
324 { return pos < s.size() ? s[pos++] : Invalid_Unicode; }
325
326 /// return true iff there are more chars in the string
more() const327 bool more() const
328 { return pos < s.size(); }
329
330 protected:
331 /// the string
332 const UCS_string & s;
333
334 /// the current position in the string
335 int pos;
336 };
337
338 /// an iterator set to the start of this string
begin() const339 UCS_string::iterator begin() const
340 { return iterator(*this, 0); }
341
342 /// round last digit and discard it.
343 void round_last_digit();
344
345 /// return true if \b this string contains \b uni
346 bool contains(Unicode uni);
347
348 /// case-sensitive comparison: return true iff \b this comes before \b other
349 bool lexical_before(const UCS_string other) const;
350
351 /// dump \b this string to out (like U+nnn U+mmm ... )
352 ostream & dump(ostream & out) const;
353
354 /// sort the characters in this string by their Unicode
355 UCS_string sort() const;
356
357 /// return the characters in this string (sorted and duplicates removed)
358 UCS_string unique() const;
359
360 /// return this string HTML-escaped, starting at offset, maybe using
361 UCS_string to_HTML(int offset, bool preserve_ws) const;
362
363 /// erase 1 (!) character at pos
erase(ShapeItem pos)364 void erase(ShapeItem pos)
365 { basic_string::erase(pos, 1); }
366
367 /// helper function for Heapsort<Unicode>::sort()
greater_uni(const Unicode & u1,const Unicode & u2,const void *)368 static bool greater_uni(const Unicode & u1, const Unicode & u2, const void *)
369 { return u1 > u2; }
370
371 /// convert a signed integer value to an UCS_string (like sprintf())
372 static UCS_string from_int(int64_t value);
373
374 /// convert an unsigned integer value to an UCS_string (like sprintf())
375 static UCS_string from_uint(uint64_t value);
376
377 /// convert the integer part of value to an UCS_string and remove it
378 /// from value
379 static UCS_string from_big(APL_Float & value);
380
381 /// convert double \b value to an UCS_string with \b fract_digits fractional
382 /// digits in scaled (exponential) format
383 static UCS_string from_double_expo_prec(APL_Float value, int fract_digits);
384
385 /// convert double \b value to an UCS_string with \b fract_digits fractional
386 /// digits in fixed point format
387 static UCS_string from_double_fixed_prec(APL_Float value, int fract_digits);
388
389 /// convert double \b value to an UCS_string with \b quad_pp significant
390 /// digits in scaled (exponential) format
391 static UCS_string from_double_expo_pp(APL_Float value, int quad_pp);
392
393 /// convert double \b value to an UCS_string with \b quad_pp significant
394 /// digits in fixed point format
395 static UCS_string from_double_fixed_pp(APL_Float value, int quad_pp);
396
397 /// return the total number of UCS_strings
get_total_count()398 static ShapeItem get_total_count()
399 { return total_count; }
400
401 /// return true if n1 < n2
compare_names(const UCS_string & n1,const UCS_string & n2,const void *)402 static bool compare_names(const UCS_string & n1,
403 const UCS_string & n2, const void *)
404 { return n2.compare(n1) == COMP_LT; }
405
406 protected:
407 /// the total number of UCS_strings
408 static ShapeItem total_count;
409
410 /// the next unique instance_id
411 static ShapeItem total_id;
412
413 #if UCS_tracking
414 /// a unique number for \b this UCS_string
415 ShapeItem instance_id;
416 #endif
417
418 private:
419 /// prevent accidental allocation
420 void * operator new[](size_t size);
421
422 /// prevent accidental de-allocation
423 void operator delete[](void *);
424
425 private:
426 /// prevent accidental usage of the rather dangerous default len parameter
427 /// in basic_strng::erase(pos, len = npos)
428 basic_string & erase(size_type pos, size_type len);
429 };
430 //-----------------------------------------------------------------------------
431 inline void
Hswap(const UCS_string * & u1,const UCS_string * & u2)432 Hswap(const UCS_string * & u1, const UCS_string * & u2)
433 {
434 const UCS_string * tmp = u1; u1 = u2; u2 = tmp;
435 }
436 //-----------------------------------------------------------------------------
437 inline void
Hswap(UCS_string & u1,UCS_string & u2)438 Hswap(UCS_string & u1, UCS_string & u2)
439 {
440 UCS_string u = u1;
441 u1 = u2;
442 u2 = u;
443 }
444 //-----------------------------------------------------------------------------
445
446 #endif // __UCS_STRING_HH_DEFINED__
447
448