1 /*
2     This file is part of GNU APL, a free implementation of the
3     ISO/IEC Standard 13751, "Programming Language APL, Extended"
4 
5     Copyright (C) 2008-2019  Dr. Jürgen Sauermann
6 
7     This program is free software: you can redistribute it and/or modify
8     it under the terms of the GNU General Public License as published by
9     the Free Software Foundation, either version 3 of the License, or
10     (at your option) any later version.
11 
12     This program is distributed in the hope that it will be useful,
13     but WITHOUT ANY WARRANTY; without even the implied warranty of
14     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15     GNU General Public License for more details.
16 
17     You should have received a copy of the GNU General Public License
18     along with this program.  If not, see <http://www.gnu.org/licenses/>.
19 */
20 
21 #ifndef __UTF8_STRING_HH_DEFINED__
22 #define __UTF8_STRING_HH_DEFINED__
23 
24 #include <iostream>
25 #include <stdint.h>
26 #include <string>
27 
28 #include "Common.hh"
29 
30 using namespace std;
31 
32 class UCS_string;
33 class Value;
34 
35 //-----------------------------------------------------------------------------
36 /// one byte of a UTF8 encoded Unicode (RFC 3629) string
typedef uint8_t UTF8;

//-----------------------------------------------------------------------------
/// Reinterpret an untyped pointer as a pointer to constant UTF8 bytes.
/// Only the static type changes; the returned address is the one passed
/// in as \b vp.
inline const UTF8 *
utf8P(const void * vp)
{
const UTF8 * const bytes = static_cast<const UTF8 *>(vp);
  return bytes;
}
46 //-----------------------------------------------------------------------------
47 /// an UTF8 encoded Unicode (RFC 3629) string
48 class UTF8_string : public std::basic_string<UTF8>
49 {
50 public:
51    /// constructor: empty UTF8_string
UTF8_string()52    UTF8_string()
53    {}
54 
55    /// constructor: UTF8_string from 0-terminated C string.
UTF8_string(const char * str)56    UTF8_string(const char * str)
57       { while (*str)   *this += *str++; }
58 
59    /// constructor: copy of C string, but at most len bytes
UTF8_string(const UTF8 * str,size_t len)60    UTF8_string(const UTF8 * str, size_t len)
61       {
62         loop(l, len)
63             if (*str)   *this += *str++;
64             else        break;
65       }
66 
67    /// constructor: copy of UCS string. The UCS characters will be UTF8-encoded
68    UTF8_string(const UCS_string & ucs);
69 
70    /// constructor: UCS_string from (simple character vector) APL value.
71    /// Non-ASCII characters will be UTF8 encoded.
72    UTF8_string(const Value & value);
73 
74    /// return true iff \b this is equal to \b other
operator ==(const UTF8_string & other) const75    bool operator ==(const UTF8_string & other) const
76       {
77         if (size() != other.size())   return false;
78         loop(c, size())   if (at(c) != other.at(c))   return false;
79         return true;
80       }
81 
82    /// return \b this string as a 0-termionated C string
c_str() const83    const char * c_str() const
84       { return reinterpret_cast<const char *>
85                                (std::basic_string<UTF8>::c_str()); }
86 
87    /// prevent basic_string::erase() with its dangerous default value for
88    /// the number of erased character.
erase(size_t pos)89    void erase(size_t pos)
90       { basic_string::erase(pos, 1); }
91 
92    /// return the last byte in this string
back() const93    UTF8 back() const
94       { Assert(size());   return at(size() - 1); }
95 
96    /// discard the last byte in this string
pop_back()97    void pop_back()
98       { Assert(size());   resize(size() - 1); }
99 
100    /// append a 0-terminated C string
append_ASCII(const char * ascii)101    void append_ASCII(const char * ascii)
102       { while (*ascii)   *this += *ascii++; }
103 
104    /// append the UTF8_string \b suffix
append_UTF8(const UTF8_string & suffix)105    void append_UTF8(const UTF8_string & suffix)
106       { loop(s, suffix.size())   *this += suffix[s]; }
107 
108    /// display bytes in this UTF string
109    ostream & dump_hex(ostream & out, int max_bytes) const;
110 
111    /// return true iff string ends with ext (usually a file name extennsion)
112    bool ends_with(const char * ext) const;
113 
114    /// return true iff string starts with path (usually a file path)
115    bool starts_with(const char * path) const;
116 
117    /// skip over < ... > and expand &lt; and friends
118    int un_HTML(int in_HTML);
119 
120    /// round a digit string is the fractional part of a number between
121    /// 0.0... and 0.9... up or down according to its last digit, return true
122    /// if the exponent shall be increased (because 1.0 -> 0.1)
123    bool round_0_1();
124 
125    /// convert the first char in UTF8-encoded string to Unicode,
126    /// setting len to the number of bytes in the UTF8 encoding of the char
127    static Unicode toUni(const UTF8 * string, int & len, bool verbose);
128 
129    /// return the next UTF8 encoded char from an input file
130    static Unicode getc(istream & in);
131 };
132 //=============================================================================
133 /// A UTF8 string to be used as filebuf in UTF8_ostream
134 class UTF8_filebuf : public filebuf
135 {
136 public:
137    /// return the data in this filebuf
get_data()138    const UTF8_string & get_data()
139       { return data; }
140 
141 protected:
142    /// insert \b c into this filebuf
143    virtual int overflow(int c);
144 
145    /// the data in this filebuf
146    UTF8_string data;
147 };
148 //=============================================================================
149 /// a UTF8 string that can be used as ostream
150 class UTF8_ostream : public ostream
151 {
152 public:
153    /// An UTF8_string that can be used like an ostream to format data
UTF8_ostream()154    UTF8_ostream()
155    : ostream(&utf8_filebuf)
156    {}
157 
158    /// return the data in this UTF8_string
get_data()159    const UTF8_string & get_data()
160       { return utf8_filebuf.get_data(); }
161 
162 protected:
163    /// the filebuf of this ostream
164    UTF8_filebuf utf8_filebuf;
165 };
166 //=============================================================================
167 
168 #endif // __UTF8_STRING_HH_DEFINED__
169