1 /*
2 This file is part of GNU APL, a free implementation of the
3 ISO/IEC Standard 13751, "Programming Language APL, Extended"
4
5 Copyright (C) 2008-2019 Dr. Jürgen Sauermann
6
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #ifndef __UTF8_STRING_HH_DEFINED__
22 #define __UTF8_STRING_HH_DEFINED__
23
24 #include <iostream>
25 #include <stdint.h>
26 #include <string>
27
28 #include "Common.hh"
29
30 using namespace std;
31
32 class UCS_string;
33 class Value;
34
35 //-----------------------------------------------------------------------------
/// One byte (octet) of a UTF8 encoded Unicode (RFC 3629) string. A single
/// Unicode character may occupy several consecutive UTF8 bytes.
typedef uint8_t UTF8;
38
39 //-----------------------------------------------------------------------------
40 /// frequently used cast to const UTF8 *
41 inline const UTF8 *
utf8P(const void * vp)42 utf8P(const void * vp)
43 {
44 return reinterpret_cast<const UTF8 *>(vp);
45 }
46 //-----------------------------------------------------------------------------
47 /// an UTF8 encoded Unicode (RFC 3629) string
48 class UTF8_string : public std::basic_string<UTF8>
49 {
50 public:
51 /// constructor: empty UTF8_string
UTF8_string()52 UTF8_string()
53 {}
54
55 /// constructor: UTF8_string from 0-terminated C string.
UTF8_string(const char * str)56 UTF8_string(const char * str)
57 { while (*str) *this += *str++; }
58
59 /// constructor: copy of C string, but at most len bytes
UTF8_string(const UTF8 * str,size_t len)60 UTF8_string(const UTF8 * str, size_t len)
61 {
62 loop(l, len)
63 if (*str) *this += *str++;
64 else break;
65 }
66
67 /// constructor: copy of UCS string. The UCS characters will be UTF8-encoded
68 UTF8_string(const UCS_string & ucs);
69
70 /// constructor: UCS_string from (simple character vector) APL value.
71 /// Non-ASCII characters will be UTF8 encoded.
72 UTF8_string(const Value & value);
73
74 /// return true iff \b this is equal to \b other
operator ==(const UTF8_string & other) const75 bool operator ==(const UTF8_string & other) const
76 {
77 if (size() != other.size()) return false;
78 loop(c, size()) if (at(c) != other.at(c)) return false;
79 return true;
80 }
81
82 /// return \b this string as a 0-termionated C string
c_str() const83 const char * c_str() const
84 { return reinterpret_cast<const char *>
85 (std::basic_string<UTF8>::c_str()); }
86
87 /// prevent basic_string::erase() with its dangerous default value for
88 /// the number of erased character.
erase(size_t pos)89 void erase(size_t pos)
90 { basic_string::erase(pos, 1); }
91
92 /// return the last byte in this string
back() const93 UTF8 back() const
94 { Assert(size()); return at(size() - 1); }
95
96 /// discard the last byte in this string
pop_back()97 void pop_back()
98 { Assert(size()); resize(size() - 1); }
99
100 /// append a 0-terminated C string
append_ASCII(const char * ascii)101 void append_ASCII(const char * ascii)
102 { while (*ascii) *this += *ascii++; }
103
104 /// append the UTF8_string \b suffix
append_UTF8(const UTF8_string & suffix)105 void append_UTF8(const UTF8_string & suffix)
106 { loop(s, suffix.size()) *this += suffix[s]; }
107
108 /// display bytes in this UTF string
109 ostream & dump_hex(ostream & out, int max_bytes) const;
110
111 /// return true iff string ends with ext (usually a file name extennsion)
112 bool ends_with(const char * ext) const;
113
114 /// return true iff string starts with path (usually a file path)
115 bool starts_with(const char * path) const;
116
117 /// skip over < ... > and expand < and friends
118 int un_HTML(int in_HTML);
119
120 /// round a digit string is the fractional part of a number between
121 /// 0.0... and 0.9... up or down according to its last digit, return true
122 /// if the exponent shall be increased (because 1.0 -> 0.1)
123 bool round_0_1();
124
125 /// convert the first char in UTF8-encoded string to Unicode,
126 /// setting len to the number of bytes in the UTF8 encoding of the char
127 static Unicode toUni(const UTF8 * string, int & len, bool verbose);
128
129 /// return the next UTF8 encoded char from an input file
130 static Unicode getc(istream & in);
131 };
132 //=============================================================================
133 /// A UTF8 string to be used as filebuf in UTF8_ostream
134 class UTF8_filebuf : public filebuf
135 {
136 public:
137 /// return the data in this filebuf
get_data()138 const UTF8_string & get_data()
139 { return data; }
140
141 protected:
142 /// insert \b c into this filebuf
143 virtual int overflow(int c);
144
145 /// the data in this filebuf
146 UTF8_string data;
147 };
148 //=============================================================================
149 /// a UTF8 string that can be used as ostream
150 class UTF8_ostream : public ostream
151 {
152 public:
153 /// An UTF8_string that can be used like an ostream to format data
UTF8_ostream()154 UTF8_ostream()
155 : ostream(&utf8_filebuf)
156 {}
157
158 /// return the data in this UTF8_string
get_data()159 const UTF8_string & get_data()
160 { return utf8_filebuf.get_data(); }
161
162 protected:
163 /// the filebuf of this ostream
164 UTF8_filebuf utf8_filebuf;
165 };
166 //=============================================================================
167
168 #endif // __UTF8_STRING_HH_DEFINED__
169