1 /*
2     This file is part of GNU APL, a free implementation of the
3     ISO/IEC Standard 13751, "Programming Language APL, Extended"
4 
5     Copyright (C) 2008-2019  Dr. Jürgen Sauermann
6 
7     This program is free software: you can redistribute it and/or modify
8     it under the terms of the GNU General Public License as published by
9     the Free Software Foundation, either version 3 of the License, or
10     (at your option) any later version.
11 
12     This program is distributed in the hope that it will be useful,
13     but WITHOUT ANY WARRANTY; without even the implied warranty of
14     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15     GNU General Public License for more details.
16 
17     You should have received a copy of the GNU General Public License
18     along with this program.  If not, see <http://www.gnu.org/licenses/>.
19 */
20 
21 #include <math.h>
22 #include <string.h>
23 #include <vector>
24 
25 #include "Backtrace.hh"
26 #include "Common.hh"
27 #include "FloatCell.hh"
28 #include "Heapsort.hh"
29 #include "Output.hh"
30 #include "PrintBuffer.hh"
31 #include "PrintOperator.hh"
32 #include "UCS_string.hh"
33 #include "UTF8_string.hh"
34 #include "Value.hh"
35 
36 ShapeItem UCS_string::total_count = 0;
37 ShapeItem UCS_string::total_id = 0;
38 
39 //-----------------------------------------------------------------------------
UCS_string()40 UCS_string::UCS_string()
41 {
42    create(LOC);
43 }
44 //-----------------------------------------------------------------------------
UCS_string(Unicode uni)45 UCS_string::UCS_string(Unicode uni)
46    : basic_string<Unicode>(1, uni)
47 {
48   create(LOC);
49 }
50 //-----------------------------------------------------------------------------
UCS_string(const Unicode * data,size_t len)51 UCS_string::UCS_string(const Unicode * data, size_t len)
52    : basic_string<Unicode>(data, len)
53 {
54    create(LOC);
55 }
56 //-----------------------------------------------------------------------------
UCS_string(size_t len,Unicode uni)57 UCS_string::UCS_string(size_t len, Unicode uni)
58    : basic_string<Unicode>(len, uni)
59 {
60    create(LOC);
61 }
62 //-----------------------------------------------------------------------------
/// copy constructor: construct a copy of \b ucs
UCS_string::UCS_string(const UCS_string & ucs)
   : basic_string<Unicode>(ucs)
{
   create(LOC);   // the copy is registered like every other new string
}
68 //-----------------------------------------------------------------------------
/// construct a copy of the sub-string of \b ucs that starts at position
/// \b pos and is (at most) \b len characters long
UCS_string::UCS_string(const UCS_string & ucs, size_t pos, size_t len)
   : basic_string<Unicode>(ucs, pos, len)
{
   create(LOC);   // register the new string
}
74 //-----------------------------------------------------------------------------
UCS_string(const char * cstring)75 UCS_string::UCS_string(const char * cstring)
76 {
77    // calling this constructor with and utf8-encoded C string is usually wrong.
78    //
79    // Instead of:   UCS_string(const char * utf8_string)
80    // one should:   UCS_string(UTF8_string(utf8_string))
81    //
82    // so that the constructor below takes care of the utf8-decoding.
83    //
84    // For ASCII C strings this constuctor is fine.
85    //
86    create(LOC);
87 
88    while (*cstring)
89       {
90         Assert((0x80 & *cstring) == 0);   // ASCII
91         *this += Unicode(*cstring++);
92       }
93 }
94 //-----------------------------------------------------------------------------
UCS_string(const UTF8_string & utf)95 UCS_string::UCS_string(const UTF8_string & utf)
96 {
97    create(LOC);
98 
99    Log(LOG_char_conversion)
100       CERR << "UCS_string::UCS_string(): utf = " << utf << endl;
101 
102    for (size_t i = 0; i < utf.size();)
103       {
104         const uint32_t b0 = utf[i++];
105         uint32_t bx = b0;
106         uint32_t more;
107         if      ((b0 & 0x80) == 0x00)   { more = 0;             }
108         else if ((b0 & 0xE0) == 0xC0)   { more = 1; bx &= 0x1F; }
109         else if ((b0 & 0xF0) == 0xE0)   { more = 2; bx &= 0x0F; }
110         else if ((b0 & 0xF8) == 0xF0)   { more = 3; bx &= 0x0E; }
111         else if ((b0 & 0xFC) == 0xF8)   { more = 4; bx &= 0x07; }
112         else if ((b0 & 0xFE) == 0xFC)   { more = 5; bx &= 0x03; }
113         else
114            {
115              utf.dump_hex(CERR << "Bad UTF8 string: ", 40)
116                                << " at " << LOC <<  endl;
117              Backtrace::show(__FILE__, __LINE__);
118              return;
119            }
120 
121         uint32_t uni = 0;
122         for (; more; --more)
123             {
124               if (i >= utf.size())
125                  {
126                    utf.dump_hex(CERR << "Truncated UTF8 string: ", 40)
127                       << " len " << utf.size() << " at " << LOC <<  endl;
128                    if (utf.size() >= 40)
129                       {
130                          const UTF8_string end(&utf[utf.size() - 10], 10);
131                          end.dump_hex(CERR << endl << "(ending with : ", 20)
132                                            << ")" << endl;
133                       }
134                    return;
135                  }
136 
137               const UTF8 subc = utf[i++];
138               if ((subc & 0xC0) != 0x80)
139                  {
140                    utf.dump_hex(CERR << "Bad UTF8 string: ", 40)
141                       << " len " << utf.size() << " at " << LOC <<  endl;
142                    if (utf.size() >= 40)
143                       {
144                          const UTF8_string end(&utf[utf.size() - 10], 10);
145                          end.dump_hex(CERR << endl << "(ending with : ", 20)
146                                            << ")" << endl;
147                       }
148                    Backtrace::show(__FILE__, __LINE__);
149                    return;
150                  }
151 
152               bx  <<= 6;
153               uni <<= 6;
154               uni |= subc & 0x3F;
155             }
156 
157          append(Unicode(bx | uni));
158       }
159 
160    Log(LOG_char_conversion)
161       CERR << "UCS_string::UCS_string(): ucs = " << *this << endl;
162 }
163 //-----------------------------------------------------------------------------
UCS_string(APL_Float value,bool & scaled,const PrintContext & pctx)164 UCS_string::UCS_string(APL_Float value, bool & scaled,
165                        const PrintContext & pctx)
166 {
167    create(LOC);
168 
169 int quad_pp = pctx.get_PP();
170    if (quad_pp > MAX_Quad_PP)   quad_pp = MAX_Quad_PP;
171    if (quad_pp < MIN_Quad_PP)   quad_pp = MIN_Quad_PP;
172 
173 const bool negative = (value < 0.0);
174    if (negative)   value = -value;
175 
176 int expo = 0;
177 
178    if (value >= 10.0)   // large number, positive exponent
179       {
180         if (value > 1e307)
181            {
182              if (negative)   append_UTF8("¯∞");
183              else            append_UTF8("∞");
184              FloatCell::map_FC(*this);
185              return;
186            }
187 
188        while (value >= 1e16)   { value *= 1e-16;   expo += 16; }
189        while (value >= 1e4)    { value *= 1e-4;    expo +=  4; }
190        while (value >= 1e1)    { value *= 1e-1;    ++expo;     }
191       }
192    else if (value < 1.0)   // small number, negative exponent
193       {
194        if (value < 1e-305)   // very small number: make it 0
195           {
196             append(UNI_ASCII_0);
197             return;
198           }
199 
200        while (value < 1e-16)   { value *= 1e16;   expo -= 16; }
201        while (value < 1e-4)    { value *= 1e4;    expo -=  4; }
202        while (value < 1.0)     { value *= 10.0;   --expo;     }
203       }
204 
205    // In theory, at this point, 1.0 ≤ value < 10.0. In reality value can
206    // be outside, though, due to rounding errors.
207 
208    // create a string with quad_pp + 1 significant digits.
209    // The last digit is used for rounding and then discarded.
210    //
211 UCS_string digits;
212    loop(d, (quad_pp + 2))
213       {
214         if (value >= 10.0)
215            {
216              // 10.0 or more is a rounding error from 9,999...
217              digits.append(Unicode(10 + '0'));
218              while (digits.size() < (quad_pp + 2))   digits.append(UNI_ASCII_0);
219              break;
220            }
221         else if (value < 0.0)
222            {
223              // less than 0.0 is a rounding error from 0.000...
224              while (digits.size() < (quad_pp + 2))   digits.append(UNI_ASCII_0);
225              break;
226              digits.append(UNI_ASCII_0);
227              value = 0.0;
228            }
229         else
230            {
231              const int dig = int(value);
232              value -= dig;
233              value *= 10.0;
234              digits.append(Unicode(dig + '0'));
235            }
236       }
237 
238    if (digits[0] != '0')   digits.pop_back();
239 
240    // round last digit
241    //
242 const Unicode last = digits.back();
243    digits.pop_back();
244 
245    if (last >= '5')   digits.back() = Unicode(digits.back() + 1);
246 
247    // adjust carries of 2nd to last digit
248    //
249    for (int d = digits.size() - 1; d > 0; --d)   // all but first
250        {
251         if (digits[d] > '9')
252            {
253              digits[d] =     Unicode(digits[d]     - 10);
254              digits[d - 1] = Unicode(digits[d - 1] +  1);
255            }
256        }
257 
258    // adjust carry of 1st digit
259    //
260    if (digits[0] > '9')
261       {
262         digits[0] = Unicode(digits[0] - 10);
263         digits.insert(0, UNI_ASCII_1);
264         ++expo;
265         digits.pop_back();
266       }
267 
268    // remove trailing zeros
269    //
270    while (digits.size() > 1 && digits.back() == UNI_ASCII_0)  digits.pop_back();
271 
272    // force scaled format if:
273    //
274    // value < .00001     ( value has ≥ 5 leading zeroes)
275    // value > 10⋆quad_pp ( integer larger than ⎕PP)
276    //
277    if ((expo < -6) || (expo > quad_pp))   scaled = true;
278 
279    if (negative)   append(UNI_OVERBAR);
280 
281    if (scaled)
282       {
283         append(digits[0]);       // integer part
284         if (digits.size() > 1)   // fractional part
285            {
286              append(UNI_ASCII_FULLSTOP);
287              loop(d, (digits.size() - 1))   append(digits[d + 1]);
288            }
289         if (expo < 0)
290            {
291              append(UNI_ASCII_E);
292              append(UNI_OVERBAR);
293              append_number(-expo);
294            }
295         else if (expo > 0)
296            {
297              append(UNI_ASCII_E);
298              append_number(expo);
299            }
300         else if (!(pctx.get_style() & PST_NO_EXPO_0)) // expo == 0
301            {
302              append(UNI_ASCII_E);
303              append(UNI_ASCII_0);
304            }
305       }
306    else
307       {
308         if (expo < 0)   // 0.000...
309            {
310              append(UNI_ASCII_0);
311              append(UNI_ASCII_FULLSTOP);
312              loop(e, (-(expo + 1)))   append(UNI_ASCII_0);
313              append(digits);
314            }
315         else   // expo >= 0
316            {
317              loop(e, expo + 1)
318                 {
319                   if (e < digits.size())   append(digits[e]);
320                   else                     append(UNI_ASCII_0);
321                 }
322 
323              if ((expo + 1) < digits.size())   // there are fractional digits
324                 {
325                   append(UNI_ASCII_FULLSTOP);
326                   for (int e = expo + 1; e < digits.size(); ++e)
327                      {
328                        if (e < digits.size())   append(digits[e]);
329                        else                     break;
330                      }
331                 }
332            }
333       }
334 
335    FloatCell::map_FC(*this);
336 }
337 //-----------------------------------------------------------------------------
UCS_string(const PrintBuffer & pb,Rank rank,int quad_PW)338 UCS_string::UCS_string(const PrintBuffer & pb, Rank rank, int quad_PW)
339 {
340    create(LOC);
341 
342    if (pb.get_height() == 0)   return;      // empty PrintBuffer
343 
344 const int total_width = pb.get_width(0);
345 
346 std::vector<int> breakpoints;
347    breakpoints.reserve(2*total_width/quad_PW);
348 
349    // print rows, breaking at breakpoints
350    //
351    loop(row, pb.get_height())
352        {
353          if (row)   append(UNI_ASCII_LF);   // end previous row
354          int col = 0;
355          int b = 0;
356 
357          while (col < total_width)
358             {
359               int chunk_len;
360               if (row == 0)   // first row: set up breakpoints
361                  {
362                    chunk_len = pb.get_line(0).compute_chunk_length(quad_PW,col);
363                    breakpoints.push_back(chunk_len);
364                  }
365               else
366                  {
367                    chunk_len = breakpoints[b++];
368                  }
369 
370               if (col)   append_UTF8("\n      ");
371               UCS_string trow(pb.get_line(row), col, chunk_len);
372               trow.remove_trailing_padchars();
373               append(trow);
374 
375               col += chunk_len;
376             }
377        }
378 
379    // replace pad chars with blanks.
380    //
381    loop(u, size())
382        {
383          if (is_iPAD_char(at(u)))   at(u) = UNI_ASCII_SPACE;
384        }
385 }
386 //-----------------------------------------------------------------------------
387 /// constructor
UCS_string(const Value & value)388 UCS_string::UCS_string(const Value & value)
389 {
390    create(LOC);
391 
392    if (value.get_rank() > 1) RANK_ERROR;
393 
394 const ShapeItem ec = value.element_count();
395    reserve(ec);
396 
397    loop(e, ec)   append(value.get_ravel(e).get_char_value());
398 }
399 //-----------------------------------------------------------------------------
UCS_string(istream & in)400 UCS_string::UCS_string(istream & in)
401 {
402    create(LOC);
403 
404    for (;;)
405       {
406         const Unicode uni = UTF8_string::getc(in);
407         if (uni == Invalid_Unicode)   return;
408         if (uni == UNI_ASCII_LF)      return;
409         append(uni);
410       }
411 }
412 //-----------------------------------------------------------------------------
413 int
compute_chunk_length(int quad_PW,int col) const414 UCS_string::compute_chunk_length(int quad_PW, int col) const
415 {
416 int chunk_len = quad_PW;
417 
418    if (col)   chunk_len -= 6;   // subsequent line inden
419 
420 int pos = col + chunk_len;
421    if (pos >= size())   return size() - col;
422 
423    while (--pos > col)
424       {
425          const Unicode uni = at(pos);
426          if (uni == UNI_iPAD_U2 || uni == UNI_iPAD_U3)
427             {
428                chunk_len = pos - col + 1;
429                break;
430             }
431       }
432 
433    return chunk_len;
434 }
435 //-----------------------------------------------------------------------------
436 void
remove_trailing_padchars()437 UCS_string::remove_trailing_padchars()
438 {
439    // remove trailing pad chars from align() and append_string(),
440    // but leave other pad chars intact.
441    // But only if the line has no frame (vert).
442    //
443 
444    // If the line contains UNI_iPAD_L0 (higher dimension separator)
445    // then discard all chars.
446    //
447    loop(u, size())
448        {
449          if (at(u) == UNI_LINE_VERT)    break;
450          if (at(u) == UNI_LINE_VERT2)   break;
451          if (at(u) == UNI_iPAD_L0)
452             {
453               clear();
454               return;
455             }
456        }
457 
458    while (size())
459       {
460         const Unicode last = back();
461         if (last == UNI_iPAD_L0 ||
462             last == UNI_iPAD_L1 ||
463             last == UNI_iPAD_L2 ||
464             last == UNI_iPAD_L3 ||
465             last == UNI_iPAD_L4 ||
466             last == UNI_iPAD_U7)
467             pop_back();
468         else
469             break;
470       }
471 }
472 //-----------------------------------------------------------------------------
473 void
remove_trailing_whitespaces()474 UCS_string::remove_trailing_whitespaces()
475 {
476    while (size() && back() <= UNI_ASCII_SPACE)   pop_back();
477 }
478 //-----------------------------------------------------------------------------
479 void
remove_leading_whitespaces()480 UCS_string::remove_leading_whitespaces()
481 {
482 int count = 0;
483    loop(s, size())
484       {
485         if (at(s) <= UNI_ASCII_SPACE)   ++count;
486         else                            break;
487       }
488 
489    if (count == 0)        return;      // no leading whitspaces
490    if (count == size())   clear();     // only whitespaces
491    else                   basic_string::erase(0, count);
492 }
493 //-----------------------------------------------------------------------------
494 void
split_ws(UCS_string & rest)495 UCS_string::split_ws(UCS_string & rest)
496 {
497    remove_leading_and_trailing_whitespaces();
498 
499    loop(clen, size())
500        {
501          if (Avec::is_white(at(clen)))   // whilespace: end of command
502             {
503               ShapeItem arg = clen;
504               while (arg < size() && Avec::is_white(at(arg)))   ++arg;
505               while (arg < size())   rest.append(at(arg++));
506               resize(clen);
507               return;
508             }
509        }
510 }
511 //-----------------------------------------------------------------------------
512 void
copy_black(UCS_string & dest,int & idx) const513 UCS_string::copy_black(UCS_string & dest, int & idx) const
514 {
515    while (idx < size() && at(idx) <= ' ')   ++idx;
516    while (idx < size() && at(idx) >  ' ')   dest.append(at(idx++));
517    while (idx < size() && at(idx) <= ' ')   ++idx;
518 }
519 //-----------------------------------------------------------------------------
520 ShapeItem
LF_count() const521 UCS_string::LF_count() const
522 {
523 ShapeItem count = 0;
524    loop(u, size())   if (at(u) == UNI_ASCII_LF)   ++count;
525    return count;
526 }
527 //-----------------------------------------------------------------------------
528 ShapeItem
substr_pos(const UCS_string & sub) const529 UCS_string::substr_pos(const UCS_string & sub) const
530 {
531 const ShapeItem start_positions = 1 + size() - sub.size();
532    loop(start, start_positions)
533       {
534         bool mismatch = false;
535         loop(u, sub.size())
536            {
537              if (at(start + u) != sub[u])
538                 {
539                   mismatch = true;
540                   break;
541                 }
542            }
543 
544         if (!mismatch)   return start;   // found sub at start
545       }
546 
547    return -1;   // not found
548 }
549 //-----------------------------------------------------------------------------
550 bool
has_black() const551 UCS_string::has_black() const
552 {
553    loop(s, size())   if (!Avec::is_white(at(s)))   return true;
554    return false;
555 }
556 //-----------------------------------------------------------------------------
557 bool
starts_with(const char * prefix) const558 UCS_string::starts_with(const char * prefix) const
559 {
560    loop(s, size())
561       {
562         const char pc = *prefix++;
563         if (pc == 0)   return true;   // prefix matches this string.
564 
565         const Unicode uni = at(s);
566         if (uni != Unicode(pc))   return false;
567       }
568 
569    // strings match, but prefix is longer
570    //
571    return false;
572 }
573 //-----------------------------------------------------------------------------
574 bool
ends_with(const char * suffix) const575 UCS_string::ends_with(const char * suffix) const
576 {
577 const ShapeItem s_len = strlen(suffix);
578    if (size() < s_len)   return false;
579 
580    suffix += s_len;    // goto end of suffix
581    loop(s, s_len)   if (at(size() - s - 1) != *--suffix)   return false;
582    return true;
583 }
584 //-----------------------------------------------------------------------------
585 bool
starts_with(const UCS_string & prefix) const586 UCS_string::starts_with(const UCS_string & prefix) const
587 {
588    if (prefix.size() > size())   return false;
589 
590    loop(p, prefix.size())   if (at(p) != prefix[p])   return false;
591 
592    return true;
593 }
594 //-----------------------------------------------------------------------------
595 bool
starts_iwith(const char * prefix) const596 UCS_string::starts_iwith(const char * prefix) const
597 {
598    loop(s, size())
599       {
600         char pc = *prefix++;
601         if (pc == 0)   return true;   // prefix matches this string.
602         if (pc >= 'a' && pc <= 'z')   pc -= 'a' - 'A';
603 
604         int uni = at(s);
605         if (uni >= 'a' && uni <= 'z')   uni -= 'a' - 'A';
606 
607         if (uni != Unicode(pc))   return false;
608       }
609 
610    return *prefix == 0;
611 }
612 //-----------------------------------------------------------------------------
613 bool
starts_iwith(const UCS_string & prefix) const614 UCS_string::starts_iwith(const UCS_string & prefix) const
615 {
616    if (prefix.size() > size())   return false;
617 
618    loop(p, prefix.size())
619       {
620         int c1 = at(p);
621         int c2 = prefix[p];
622         if (c1 >= 'a' && c1 <= 'z')   c1 -= 'a' - 'A';
623         if (c2 >= 'a' && c2 <= 'z')   c2 -= 'a' - 'A';
624         if (c1 != c2)   return false;
625       }
626 
627    return true;
628 }
629 //-----------------------------------------------------------------------------
630 UCS_string
no_pad() const631 UCS_string::no_pad() const
632 {
633 UCS_string ret;
634    loop(s, size())
635       {
636         Unicode uni = at(s);
637         if (is_iPAD_char(uni))   uni = UNI_ASCII_SPACE;
638         ret.append(uni);
639       }
640 
641    return ret;
642 }
643 //-----------------------------------------------------------------------------
644 void
map_pad()645 UCS_string::map_pad()
646 {
647    loop(s, size())
648       {
649         if (is_iPAD_char(at(s)))   at(s) = UNI_ASCII_SPACE;
650       }
651 }
652 //-----------------------------------------------------------------------------
653 UCS_string
remove_pad() const654 UCS_string::remove_pad() const
655 {
656 UCS_string ret;
657    loop(s, size())
658       {
659         Unicode uni = at(s);
660         if (!is_iPAD_char(uni))   ret.append(uni);
661       }
662 
663    return ret;
664 }
665 //-----------------------------------------------------------------------------
666 UCS_string
reverse() const667 UCS_string::reverse() const
668 {
669 UCS_string ret;
670    for (int s = size(); s > 0;)   ret.append(at(--s));
671    return ret;
672 }
673 //-----------------------------------------------------------------------------
674 bool
is_comment_or_label() const675 UCS_string::is_comment_or_label() const
676 {
677    if (size() == 0)                          return false;
678    if (at(0) == UNI_ASCII_NUMBER_SIGN)       return true;   // comment
679    if (at(0) == UNI_COMMENT)                 return true;   // comment
680    loop(t, size())
681        {
682          if (at(t) == UNI_ASCII_COLON)       return true;   // label
683          if (!Avec::is_symbol_char(at(t)))   return false;
684        }
685 
686    return false;
687 }
688 //-----------------------------------------------------------------------------
689 ShapeItem
double_quote_count(bool in_quote2) const690 UCS_string::double_quote_count(bool in_quote2) const
691 {
692 ShapeItem count = 0;
693 bool in_quote1 = false;
694    loop(s, size())
695        {
696         const Unicode uni = at(s);
697         switch(uni)
698            {
699              case UNI_SINGLE_QUOTE:
700                   if (!in_quote2)   in_quote1 = ! in_quote1;
701                   break;
702 
703              case UNI_ASCII_DOUBLE_QUOTE:
704                   if (!in_quote1)
705                      {
706                        ++count;
707                        in_quote2 = ! in_quote2;
708                      }
709                   break;
710 
711              case UNI_ASCII_BACKSLASH:
712                   if (in_quote2)    ++s;   // ignore next char inside ""
713                   break;
714 
715              case UNI_ASCII_NUMBER_SIGN:
716              case UNI_COMMENT:
717                   if (!(in_quote1 || in_quote2))   return count;
718 
719              default:                            ;
720            }
721        }
722 
723    return count;
724 }
725 //-----------------------------------------------------------------------------
726 ShapeItem
double_quote_first() const727 UCS_string::double_quote_first() const
728 {
729 bool in_quote1 = false;
730 bool in_quote2 = true;
731    loop(s, size())
732        {
733         const Unicode uni = at(s);
734         switch(uni)
735            {
736              case UNI_SINGLE_QUOTE:
737                   if (!in_quote2)   in_quote1 = ! in_quote1;
738                   break;
739 
740              case UNI_ASCII_DOUBLE_QUOTE:
741                   if (!in_quote1)   return s;
742                   break;
743 
744              case UNI_ASCII_BACKSLASH:
745                   if (in_quote2)    ++s;   // ignore next char inside ""
746                   break;
747 
748              case UNI_ASCII_NUMBER_SIGN:
749              case UNI_COMMENT:
750                   if (in_quote1 || in_quote2)   ; // ignore # and ⍝ in atrings
751                   else                          s = size();
752                   break;
753 
754              default:                            ;
755            }
756        }
757 
758 
759    return -1;   // no un-commented and un-escaped " found
760 }
761 //-----------------------------------------------------------------------------
762 ShapeItem
double_quote_last() const763 UCS_string::double_quote_last() const
764 {
765 ShapeItem ret = -1;
766 bool in_quote1 = false;
767 bool in_quote2 = false;
768    loop(s, size())
769        {
770         const Unicode uni = at(s);
771         switch(uni)
772            {
773              case UNI_SINGLE_QUOTE:
774                   if (!in_quote2)   in_quote1 = ! in_quote1;
775                   break;
776 
777              case UNI_ASCII_DOUBLE_QUOTE:
778                   if (!in_quote1)   ret = s;
779                   break;
780 
781              case UNI_ASCII_BACKSLASH:
782                   if (in_quote2)    ++s;   // ignotr next char inside ""
783                   break;
784 
785              case UNI_ASCII_NUMBER_SIGN:
786              case UNI_COMMENT:
787                   if (in_quote1 || in_quote2)   ; // ignore # and ⍝ in atrings
788                   else                          s = size();
789                   break;
790 
791              default:                            ;
792            }
793        }
794 
795    return ret;
796 }
797 //-----------------------------------------------------------------------------
798 void
append_UTF8(const UTF8 * str)799 UCS_string::append_UTF8(const UTF8 * str)
800 {
801 const size_t len = strlen(charP(str));
802 const UTF8_string utf(str, len);
803 const UCS_string ucs(utf);
804 
805    append(ucs);
806 }
807 //-----------------------------------------------------------------------------
808 void
append_quoted(const UCS_string & other)809 UCS_string::append_quoted(const UCS_string & other)
810 {
811    append(UNI_ASCII_DOUBLE_QUOTE);
812    loop(s, other.size())
813        {
814           const Unicode uni = other[s];
815           if (uni == UNI_ASCII_DOUBLE_QUOTE)   append(UNI_ASCII_BACKSLASH);
816           append(uni);
817        }
818    append(UNI_ASCII_DOUBLE_QUOTE);
819 }
820 //-----------------------------------------------------------------------------
821 void
append_number(ShapeItem num)822 UCS_string::append_number(ShapeItem num)
823 {
824 char cc[40];
825    snprintf(cc, sizeof(cc) - 1, "%lld", long_long(num));
826    loop(c, sizeof(cc))
827       {
828         if (cc[c])   append(Unicode(cc[c]));
829         else         break;
830       }
831 }
832 //-----------------------------------------------------------------------------
833 void
append_hex(ShapeItem num,bool uppercase)834 UCS_string::append_hex(ShapeItem num, bool uppercase)
835 {
836 const char * format = uppercase ? "%llX" : "%llx";
837 char cc[40];
838    snprintf(cc, sizeof(cc) - 1, format, long_long(num));
839    loop(c, sizeof(cc))
840       {
841         if (cc[c])   append(Unicode(cc[c]));
842         else         break;
843       }
844 }
845 //-----------------------------------------------------------------------------
846 void
append_shape(const Shape & shape)847 UCS_string::append_shape(const Shape & shape)
848 {
849    loop(r, shape.get_rank())
850        {
851          if (r)   append(UNI_ASCII_SPACE);
852          ShapeItem s = shape.get_shape_item(r);
853          if (s < 0)
854             {
855               s = -s;
856               append(UNI_OVERBAR);
857             }
858          append_number(s);
859        }
860 }
861 //-----------------------------------------------------------------------------
862 void
append_float(APL_Float num)863 UCS_string::append_float(APL_Float num)
864 {
865 char cc[60];
866    snprintf(cc, sizeof(cc) - 1, "%lf", double(num));
867    loop(c, sizeof(cc))
868       {
869         if (cc[c])   append(Unicode(cc[c]));
870         else         break;
871       }
872 }
873 //-----------------------------------------------------------------------------
874 UCS_string
un_escape(bool double_quoted,bool keep_LF) const875 UCS_string::un_escape(bool double_quoted, bool keep_LF) const
876 {
877 const char * hex = "0123456789abcdef";
878 UCS_string ret;
879    ret.reserve(size());
880 
881    if (double_quoted)
882       {
883         loop(s, size())
884             {
885              const Unicode uni = at(s);
886              if (uni != UNI_ASCII_BACKSLASH)   // normal char
887                 {
888                   ret.append(uni);
889                   continue;
890                 }
891 
892              if (s >= (size() - 1))   // \ at end of string
893                 {
894                   ret.append(UNI_ASCII_BACKSLASH);
895                   break;
896                 }
897 
898              const Unicode uni1 = at(++s);
899              switch(uni1)
900                  {
901                   case UNI_ASCII_a:            ret << UNI_ASCII_BEL;   continue;
902                   case UNI_ASCII_b:            ret << UNI_ASCII_BS;    continue;
903                   case UNI_ASCII_f:            ret << UNI_ASCII_FF;    continue;
904                   case UNI_ASCII_n:            if (keep_LF)   break;
905                                                ret << UNI_ASCII_LF;    continue;
906                   case UNI_ASCII_r:            ret << UNI_ASCII_CR;    continue;
907                   case UNI_ASCII_t:            ret << UNI_ASCII_BS;    continue;
908                   case UNI_ASCII_v:            ret << UNI_ASCII_VT;    continue;
909                   case UNI_ASCII_DOUBLE_QUOTE:
910                   case UNI_ASCII_BACKSLASH:
911                                                ret << uni1;            continue;
912                   default:                     break;
913                  }
914 
915              int max_len = 0;
916              if (uni1 == UNI_ASCII_u)
917                 {
918                   max_len = 4;
919                 }
920              else if (uni1 == UNI_ASCII_x)
921                 {
922                   max_len = 2;
923                 }
924              else   // \n or \": keep them escaped
925                 {
926                   ret.append(uni);
927                   ret.append(uni1);
928                   continue;
929                 }
930 
931                // \x or \u
932                //
933                int value = 0;
934                loop(m, max_len)
935                    {
936                      if (s >= (size() - 1))   break;
937                      const int dig = at(s+1);
938                      const char * pos = strchr(hex, dig);
939                      if (pos == 0)   break;   // non-hex character
940 
941                      value = value << 4 | (pos - hex);
942                      ++s;
943                    }
944                ret.append(Unicode(value));
945             }
946       }
947    else
948       {
949         bool got_quote = false;
950         loop(s, size())
951            {
952              const Unicode uni = at(s);
953              if (uni == UNI_SINGLE_QUOTE)
954                 {
955                   if (got_quote)
956                      {
957                         ret.append(UNI_SINGLE_QUOTE);
958                         got_quote = false;
959                      }
960                   else
961                      {
962                         ret.append(UNI_SINGLE_QUOTE);
963                         got_quote = true;
964                      }
965                 }
966              else
967                 {
968                   if (got_quote)   ret.append(UNI_SINGLE_QUOTE);   // mal-formed
969                   ret.append(uni);
970                   got_quote = false;
971                 }
972            }
973       }
974 
975    return ret;
976 }
977 //-----------------------------------------------------------------------------
978 UCS_string
do_escape(bool double_quoted) const979 UCS_string::do_escape(bool double_quoted) const
980 {
981 const char * hex = "0123456789abcdef";
982 UCS_string ret;
983    ret.reserve(size());
984 
985    if (double_quoted)
986       {
987         loop(s, size())
988            {
989              const Unicode uni = at(s);
990              switch(uni)
991                 {
992                   case UNI_ASCII_BEL:            ret << "\\a";    continue;
993                   case UNI_ASCII_BS:             ret << "\\b";    continue;
994                   case UNI_ASCII_HT:             ret << "\\t";    continue;
995                   case UNI_ASCII_LF:             ret << "\\n";    continue;
996                   case UNI_ASCII_VT:             ret << "\\v";    continue;
997                   case UNI_ASCII_FF:             ret << "\\f";    continue;
998                   case UNI_ASCII_CR:             ret << "\\r";    continue;
999                   case UNI_ASCII_DOUBLE_QUOTE:   ret << "\\\"";   continue;
1000                   case UNI_ASCII_BACKSLASH:      ret << "\\\\";   continue;
1001                   default:                       break;
1002                 }
1003 
1004              // none of the above
1005              //
1006              if (uni >= UNI_ASCII_SPACE && uni < UNI_ASCII_DELETE)
1007                 {
1008                   ret.append(uni);
1009                   continue;
1010                 }
1011 
1012              if (uni <= 0x0F)   // small ASCII
1013                 {
1014                   ret << "\\x0";
1015                   ret << Unicode(hex[uni]);
1016                 }
1017              else if (uni <= 0xFF)   // other ASCII
1018                 {
1019                   ret << "\\x";
1020                   ret << Unicode(hex[uni >> 4 & 0x0F]);
1021                   ret << Unicode(hex[uni      & 0x0F]);
1022                 }
1023              else
1024                 {
1025                   ret.append(uni);
1026                 }
1027            }
1028       }
1029    else   // single-quoted
1030       {
1031         loop(s, size())
1032            {
1033              const Unicode uni = at(s);
1034              ret.append(uni);
1035              if (uni == UNI_SINGLE_QUOTE)   ret.append(uni);   // another '
1036            }
1037       }
1038 
1039    return ret;
1040 }
1041 //-----------------------------------------------------------------------------
1042 size_t
to_vector(UCS_string_vector & result) const1043 UCS_string::to_vector(UCS_string_vector & result) const
1044 {
1045 size_t max_len = 0;
1046 
1047    result.clear();
1048    if (size() == 0)   return max_len;
1049 
1050    result.push_back(UCS_string());
1051    loop(s, size())
1052       {
1053         const Unicode uni = at(s);
1054         if (uni == UNI_ASCII_LF)    // line done
1055            {
1056              const size_t len = result.back().size();
1057              if (max_len < len)   max_len = len;
1058 
1059              if (s < size() - 1)   // more coming
1060                 result.push_back(UCS_string());
1061            }
1062         else
1063            {
1064              if (uni != UNI_ASCII_CR)         // ignore \r.
1065                 result.back().append(uni);
1066            }
1067       }
1068 
1069    // if the last line lacked a \n we check max_len here again.
1070 const size_t len = result.back().size();
1071    if (max_len < len)   max_len = len;
1072 
1073    return max_len;
1074 }
1075 //-----------------------------------------------------------------------------
1076 int
atoi() const1077 UCS_string::atoi() const
1078 {
1079 int ret = 0;
1080 bool negative = false;
1081 
1082    loop(s, size())
1083       {
1084         const Unicode uni = at(s);
1085 
1086         if (!ret && Avec::is_white(uni))   continue;   // leading whitespace
1087 
1088         if (uni == UNI_ASCII_MINUS || uni == UNI_OVERBAR)
1089            {
1090              negative = true;
1091              continue;
1092            }
1093 
1094         if (uni < UNI_ASCII_0)                break;      // non-digit
1095         if (uni > UNI_ASCII_9)                break;      // non-digit
1096 
1097         ret *= 10;
1098         ret += uni - UNI_ASCII_0;
1099       }
1100 
1101    return negative ? -ret : ret;
1102 }
1103 //-----------------------------------------------------------------------------
1104 ostream &
operator <<(ostream & os,Unicode uni)1105 operator << (ostream & os, Unicode uni)
1106 {
1107    if (uni < 0x80)      return os << char(uni);
1108 
1109    if (uni < 0x800)     return os << char(0xC0 | (uni >> 6))
1110                                   << char(0x80 | (uni & 0x3F));
1111 
1112    if (uni < 0x10000)    return os << char(0xE0 | (uni >> 12))
1113                                    << char(0x80 | (uni >>  6 & 0x3F))
1114                                    << char(0x80 | (uni       & 0x3F));
1115 
1116    if (uni < 0x200000)   return os << char(0xF0 | (uni >> 18))
1117                                    << char(0x80 | (uni >> 12 & 0x3F))
1118                                    << char(0x80 | (uni >>  6 & 0x3F))
1119                                    << char(0x80 | (uni       & 0x3F));
1120 
1121    if (uni < 0x4000000)  return os << char(0xF8 | (uni >> 24))
1122                                    << char(0x80 | (uni >> 18 & 0x3F))
1123                                    << char(0x80 | (uni >> 12 & 0x3F))
1124                                    << char(0x80 | (uni >>  6 & 0x3F))
1125                                    << char(0x80 | (uni       & 0x3F));
1126 
1127    return os << char(0xFC | (uni >> 30))
1128              << char(0x80 | (uni >> 24 & 0x3F))
1129              << char(0x80 | (uni >> 18 & 0x3F))
1130              << char(0x80 | (uni >> 12 & 0x3F))
1131              << char(0x80 | (uni >>  6 & 0x3F))
1132              << char(0x80 | (uni       & 0x3F));
1133 }
1134 //-----------------------------------------------------------------------------
1135 ostream &
operator <<(ostream & os,const UCS_string & ucs)1136 operator << (ostream & os, const UCS_string & ucs)
1137 {
1138 const int fill_len = os.width() - ucs.size();
1139 
1140    if (fill_len > 0)
1141       {
1142         os.width(0);
1143         loop(u, ucs.size())   os << ucs[u];
1144         loop(f, fill_len)     os << os.fill();
1145       }
1146    else
1147       {
1148         loop(u, ucs.size())   os << ucs[u];
1149       }
1150 
1151    return os;
1152 }
1153 //-----------------------------------------------------------------------------
1154 bool
lexical_before(const UCS_string other) const1155 UCS_string::lexical_before(const UCS_string other) const
1156 {
1157    loop(u, size())
1158       {
1159         if (u >= other.size())   return false;   // other is a prefix of this
1160         if (at(u) < other.at(u))   return true;
1161         if (at(u) > other.at(u))   return false;
1162       }
1163 
1164    // at this point the common part of this and other is equal, If other
1165    // is longer then this is a prefix of other (and this comes before other)
1166    return other.size() > size();
1167 }
1168 //-----------------------------------------------------------------------------
1169 ostream &
dump(ostream & out) const1170 UCS_string::dump(ostream & out) const
1171 {
1172    out << right << hex << uppercase << setfill('0');
1173    loop(s, size())
1174       {
1175         out << " U+" << setw(4) << int(at(s));
1176       }
1177 
1178    return out << left << dec << nouppercase << setfill(' ');
1179 }
1180 //-----------------------------------------------------------------------------
1181 UCS_string
from_int(int64_t value)1182 UCS_string::from_int(int64_t value)
1183 {
1184    if (value >= 0)   return from_uint(value);
1185 
1186 UCS_string ret(UNI_OVERBAR);
1187    return ret + from_uint(- value);
1188 }
1189 //-----------------------------------------------------------------------------
1190 UCS_string
from_uint(uint64_t value)1191 UCS_string::from_uint(uint64_t value)
1192 {
1193    if (value == 0)   return UCS_string("0");
1194 
1195 int digits[40];
1196 int * d = digits;
1197 
1198    while (value)
1199       {
1200         const uint64_t v_10 = value / 10;
1201         *d++ = value - 10*v_10;
1202         value = v_10;
1203       }
1204 
1205 UCS_string ret;
1206    while (d > digits)   ret.append(Unicode(UNI_ASCII_0 + *--d));
1207    return ret;
1208 }
1209 //-----------------------------------------------------------------------------
1210 UCS_string
from_big(APL_Float & val)1211 UCS_string::from_big(APL_Float & val)
1212 {
1213    Assert(val >= 0.0);
1214 
1215 long double value = val;
1216 int digits[320];   // DBL_MAX is 1.79769313486231470E+308
1217 int * d = digits;
1218 
1219 const long double initial_fract = modf(value, &value);
1220 long double fract;
1221    for (; value >= 1.0; ++d)
1222       {
1223          fract = modf(value / 10.0, &value);   // U.x -> .U
1224          *d = int((fract + .02) * 10.0);
1225          fract -= 0.1 * *d;
1226       }
1227 
1228    val = initial_fract;
1229 
1230 UCS_string ret;
1231    if (d == digits)   ret.append(UNI_ASCII_0);   // 0.xxx
1232 
1233    while (d > digits)   ret.append(Unicode(UNI_ASCII_0 + *--d));
1234    return ret;
1235 }
1236 //-----------------------------------------------------------------------------
1237 UCS_string
from_double_expo_prec(APL_Float v,int fract_digits)1238 UCS_string::from_double_expo_prec(APL_Float v, int fract_digits)
1239 {
1240 UCS_string ret;
1241 
1242    if (v == 0.0)
1243       {
1244         ret.append(UNI_ASCII_0);
1245         if (fract_digits)   // unless integer only
1246            {
1247              ret.append(UNI_ASCII_FULLSTOP);
1248              loop(f, fract_digits)   ret.append(UNI_ASCII_0);
1249            }
1250         ret.append(UNI_ASCII_E);
1251         ret.append(UNI_ASCII_0);
1252         return ret;
1253       }
1254 
1255    if (v < 0.0)   { ret.append(UNI_OVERBAR);   v = - v; }
1256 
1257 int expo = 0;
1258    while (v >= 1.0E1)
1259       {
1260         if (v >= 1.0E9)
1261            if (v >= 1.0E13)
1262               if (v >= 1.0E15)
1263                  if     (v >= 1.0E16)      { v = v * 1.0E-16;   expo += 16; }
1264                  else /* v >= 1.0E15 */    { v = v * 1.0E-15;   expo += 15; }
1265               else
1266                  if     (v >= 1.0E14)      { v = v * 1.0E-14;   expo += 14; }
1267                  else /* v >= 1.0E13 */    { v = v * 1.0E-13;   expo += 13; }
1268            else
1269               if (v >= 1.0E11)
1270                  if     (v >= 1.0E12)      { v = v * 1.0E-12;   expo += 12; }
1271                  else /* v >= 1.0E11 */    { v = v * 1.0E-11;   expo += 11; }
1272               else
1273                  if     (v >= 1.0E10)      { v = v * 1.0E-10;   expo += 10; }
1274                  else /* v >= 1.0E9 */     { v = v * 1.0E-9;    expo += 9;  }
1275         else
1276            if (v >= 1.0E5)
1277               if (v >= 1.0E7)
1278                  if     (v >= 1.0E8)       { v = v * 1.0E-8;    expo += 8;  }
1279                  else /* v >= 1.0E7 */     { v = v * 1.0E-7;    expo += 7;  }
1280               else
1281                  if     (v >= 1.0E6)       { v = v * 1.0E-6;    expo += 6;  }
1282                  else /* v >= 1.0E5 */     { v = v * 1.0E-5;    expo += 5;  }
1283            else
1284               if (v >= 1.0E3)
1285                  if     (v >= 1.0E4)       { v = v * 1.0E-4;    expo += 4;  }
1286                  else /* v >= 1.0E3 */     { v = v * 1.0E-3;    expo += 3;  }
1287               else
1288                  if     (v >= 1.0E2)       { v = v * 1.0E-2;    expo += 2;  }
1289                  else /* v >= 1.0E1 */     { v = v * 1.0E-1;    expo += 1;  }
1290       }
1291 
1292    while (v < 1.0E0)
1293       {
1294         if (v < 1.0E-8)
1295            if (v < 1.0E-12)
1296               if (v < 1.0E-14)
1297                  if     (v < 1.0E-15)      { v = v * 1.0E-16;   expo += 16; }
1298                  else /* v < 1.0E-14 */    { v = v * 1.0E-15;   expo += 15; }
1299               else
1300                  if     (v < 1.0E-13)      { v = v * 1.0E-14;   expo += 14; }
1301                  else /* v < 1.0E-12 */    { v = v * 1.0E-13;   expo += 13; }
1302            else
1303               if (v < 1.0E-10)
1304                  if     (v < 1.0E-11)      { v = v * 1.0E12;   expo += -12; }
1305                  else /* v < 1.0E-10 */    { v = v * 1.0E11;   expo += -11; }
1306               else
1307                  if     (v < 1.0E-9 )      { v = v * 1.0E10;   expo += -10; }
1308                  else /* v < 1.0E-8 */     { v = v * 1.0E9;    expo += -9;  }
1309         else
1310            if (v < 1.0E-4)
1311               if (v < 1.0E-6)
1312                  if     (v < 1.0E-7)       { v = v * 1.0E8;    expo += -8;  }
1313                  else /* v < 1.0E-6 */     { v = v * 1.0E7;    expo += -7;  }
1314               else
1315                  if     (v < 1.0E-5)       { v = v * 1.0E6;    expo += -6;  }
1316                  else /* v < 1.0E-4 */     { v = v * 1.0E5;    expo += -5;  }
1317            else
1318               if (v < 1.0E-2)
1319                  if     (v < 1.0E-3)       { v = v * 1.0E4;    expo += -4;  }
1320                  else /* v < 1.0E-2 */     { v = v * 1.0E3;    expo += -3;  }
1321               else
1322                  if     (v < 1.0E-1)       { v = v * 1.0E2;    expo += -2;  }
1323                  else /* v < 1.0E0  */     { v = v * 1.0E1;    expo += -1;  }
1324       }
1325 
1326    Assert(v >= 1.0);
1327    Assert(v < 10.0);
1328 
1329    // print mantissa in fixed format
1330    //
1331 UCS_string mantissa = from_double_fixed_prec(v, fract_digits);
1332    if (mantissa.size() > 2 &&
1333        mantissa[0] == UNI_ASCII_1 &&
1334        mantissa[1] == UNI_ASCII_0 &&
1335        mantissa[2] == UNI_ASCII_FULLSTOP)   // 9.xxx rounded up to 10.xxx
1336       {
1337         mantissa[1] = UNI_ASCII_FULLSTOP;
1338         mantissa[2] = UNI_ASCII_0;
1339        ++expo;
1340       }
1341 
1342    ret.append(mantissa);
1343    ret.append(UNI_ASCII_E);
1344    ret.append(from_int(expo));
1345 
1346    return ret;
1347 }
1348 //-----------------------------------------------------------------------------
1349 UCS_string
from_double_fixed_prec(APL_Float v,int fract_digits)1350 UCS_string::from_double_fixed_prec(APL_Float v, int fract_digits)
1351 {
1352 UCS_string ret;
1353 
1354    if (v < 0.0)   { ret.append(UNI_OVERBAR);   v = - v; }
1355 
1356    // in the loop below, there could be rounding errors when casting float
1357    // to int. We therefore increase v slighly (by 0.3 of the rounded digit)
1358    // to avoid that.
1359    //
1360    v += 0.03 * pow(10.0, -fract_digits);
1361 
1362    ret.append(from_big(v));   // leaves fractional part of v in v
1363 
1364    ret.append(UNI_ASCII_FULLSTOP);
1365 
1366    loop(f, fract_digits + 1)
1367       {
1368         v = v * 10.0;
1369         const int vv = v;   // subject to rounding errors!
1370         ret.append(Unicode(UNI_ASCII_0 + vv));
1371         v -= vv;
1372       }
1373 
1374    ret.round_last_digit();
1375    return ret;
1376 }
1377 //-----------------------------------------------------------------------------
1378 void
round_last_digit()1379 UCS_string::round_last_digit()
1380 {
1381    Assert1(size() > 1);
1382    if (back() >= UNI_ASCII_5)   // round up
1383       {
1384         for (int q = size() - 2; q >= 0; --q)
1385             {
1386               const Unicode cc = at(q);
1387               if (cc < UNI_ASCII_0)   continue;   // not a digit
1388               if (cc > UNI_ASCII_9)   continue;   // not a digit
1389 
1390               at(q) = Unicode(cc + 1);   // round up
1391               if (cc != UNI_ASCII_9)   break;    // 0-8 rounded up: stop
1392 
1393               at(q) = UNI_ASCII_0;    // 9 rounded up: say 0 and repeat
1394               if (q)   continue;   // not first difit
1395 
1396               // something like 9.xxx has been rounded up to, say, 0.xxx
1397               // but should be 10.xxx Fix it.
1398               //
1399               for (int d = size() - 1; d > 0; --d)  at(d) = at(d - 1);
1400               at(0) = UNI_ASCII_1;
1401             }
1402       }
1403 
1404    pop_back();
1405    if (back() == UNI_ASCII_FULLSTOP)   pop_back();
1406 }
1407 //----------------------------------------------------------------------------
1408 bool
contains(Unicode uni)1409 UCS_string::contains(Unicode uni)
1410 {
1411    loop(u, size())   if (at(u) == uni)   return true;
1412    return false;
1413 }
1414 //----------------------------------------------------------------------------
1415 UCS_string
sort() const1416 UCS_string::sort() const
1417 {
1418 UCS_string ret(*this);
1419    Heapsort<Unicode>::sort(&ret[0], ret.size(), 0, greater_uni);
1420    return ret;
1421 }
1422 //----------------------------------------------------------------------------
1423 UCS_string
unique() const1424 UCS_string::unique() const
1425 {
1426    if (size() <= 1)   return UCS_string(*this);
1427 
1428 UCS_string sorted = sort();
1429 UCS_string ret;
1430    ret.reserve(sorted.size());
1431 
1432    ret.append(sorted[0]);
1433    for (ShapeItem j = 1; j < size(); ++j)
1434        {
1435          if (sorted[j] != ret.back())   ret.append(sorted[j]);
1436        }
1437 
1438    Heapsort<Unicode>::sort(&ret[0], ret.size(), 0, greater_uni);
1439    return ret;
1440 }
1441 //----------------------------------------------------------------------------
1442 UCS_string
to_HTML(int offset,bool preserve_ws) const1443 UCS_string::to_HTML(int offset, bool preserve_ws) const
1444 {
1445 UCS_string ret;
1446    for (;offset < size(); ++offset)
1447       {
1448         const Unicode uni = at(offset);
1449         switch(uni)
1450            {
1451              case ' ':  if (preserve_ws)   ret.append_ASCII("&nbsp;");
1452                         else               ret.append(uni);
1453                         break;
1454              case '#':  ret.append_ASCII("&#35;");   break;
1455              case '%':  ret.append_ASCII("&#37;");   break;
1456              case '&':  ret.append_ASCII("&#38;");   break;
1457              case '<':  ret.append_ASCII("&lt;");    break;
1458              case '>':  ret.append_ASCII("&gt;");    break;
1459              default:   ret.append(uni);
1460            }
1461       }
1462 
1463    return ret;
1464 }
1465 //----------------------------------------------------------------------------
1466 #if UCS_tracking
~UCS_string()1467 UCS_string::~UCS_string()
1468 {
1469    --total_count;
1470    cerr << setfill('0') << endl << "@@ " << setw(5) << instance_id
1471         << " DEL ##" << total_count
1472         << " c= " << Backtrace::caller(3) << setfill(' ') << endl;
1473 }
1474 //----------------------------------------------------------------------------
create(const char * loc)1475 void UCS_string::create(const char * loc)
1476 {
1477    ++total_count;
1478    instance_id = ++total_id;
1479    cerr << setfill('0') << endl << "@@ " << setw(5) << instance_id
1480         << " NEW ##" << total_count << " " << loc
1481         << " c= " << Backtrace::caller(3) << setfill(' ') << endl;
1482 }
1483 
1484 #endif
1485 //----------------------------------------------------------------------------
1486