1 // madronalib: a C++ framework for DSP applications.
2 // Copyright (c) 2020 Madrona Labs LLC. http://www.madronalabs.com
3 // Distributed under the MIT license: http://madrona-labs.mit-license.org/
4
5 #include "MLText.h"
6
7 #include <cstring>
8 #include <iostream>
9 #include <vector>
10
11 #include "MLMemoryUtils.h"
12 #include "utf.hpp"
13
14 namespace ml
15 {
16 // Iterator::Impl
17
18 class TextFragment::Iterator::Impl
19 {
20 friend class TextFragment::Iterator;
21 friend bool operator!=(Iterator lhs, Iterator rhs);
22 friend bool operator==(Iterator lhs, Iterator rhs);
23
24 utf::codepoint_iterator<const char*> _utf8Iter;
25
26 public:
Impl(const char * pos)27 Impl(const char* pos) : _utf8Iter(utf::codepoint_iterator<const char*>(pos)) {}
Impl(const utf::codepoint_iterator<const char * > & utf_iter)28 Impl(const utf::codepoint_iterator<const char*>& utf_iter) : _utf8Iter(utf_iter) {}
29 };
30
31 // Iterator
32
Iterator(const char * pos)33 TextFragment::Iterator::Iterator(const char* pos) { pImpl = std::unique_ptr<Impl>(new Impl(pos)); }
34
Iterator(const Iterator & it)35 TextFragment::Iterator::Iterator(const Iterator& it) // = default;
36 {
37 pImpl = std::unique_ptr<Impl>(new Impl(it.pImpl->_utf8Iter));
38 }
39
40 TextFragment::Iterator::~Iterator() = default;
41
operator *()42 CodePoint TextFragment::Iterator::operator*() { return pImpl->_utf8Iter.operator*(); }
43
44 // CodePoint operator->() { return _utf8Iter.operator->(); }
45
operator ++()46 TextFragment::Iterator& TextFragment::Iterator::operator++()
47 {
48 pImpl->_utf8Iter.operator++();
49 return *this;
50 }
51
operator ++(int i)52 CodePoint TextFragment::Iterator::operator++(int i)
53 {
54 CodePoint preIncrementValue = pImpl->_utf8Iter.operator*();
55 pImpl->_utf8Iter.operator++(i);
56 return preIncrementValue;
57 }
58
operator !=(TextFragment::Iterator lhs,TextFragment::Iterator rhs)59 bool operator!=(TextFragment::Iterator lhs, TextFragment::Iterator rhs)
60 {
61 return lhs.pImpl->_utf8Iter != rhs.pImpl->_utf8Iter;
62 }
63
operator ==(TextFragment::Iterator lhs,TextFragment::Iterator rhs)64 bool operator==(TextFragment::Iterator lhs, TextFragment::Iterator rhs)
65 {
66 return !(lhs.pImpl->_utf8Iter != rhs.pImpl->_utf8Iter);
67 }
68
69 // TextFragment
70
TextFragment()71 TextFragment::TextFragment() noexcept
72 {
73 mSize = 0;
74 mpText = mLocalText;
75 nullTerminate();
76 }
77
TextFragment(const char * pChars)78 TextFragment::TextFragment(const char* pChars) noexcept
79 {
80 create(strlen(pChars));
81 // a bad alloc will result in this being a null object.
82 // copy the input string into local storage
83 if (mpText)
84 {
85 std::copy(pChars, pChars + mSize, mpText);
86 nullTerminate();
87 }
88 }
89
90 // this ctor can be used to save the work of counting the length if we have a
91 // length already, as with static HashedCharArrays.
TextFragment(const char * pChars,size_t len)92 TextFragment::TextFragment(const char* pChars, size_t len) noexcept
93 {
94 create(len);
95 if (mpText)
96 {
97 std::copy(pChars, pChars + mSize, mpText);
98 nullTerminate();
99 }
100 }
101
TextFragment(CodePoint c)102 TextFragment::TextFragment(CodePoint c) noexcept
103 {
104 if (!validateCodePoint(c))
105 {
106 c = 0x2639; // sad face
107 }
108 // all possible codepoints fit into local text
109 char* end = utf::internal::utf_traits<utf::utf8>::encode(c, mLocalText);
110 mSize = end - mLocalText;
111 mpText = mLocalText;
112 nullTerminate();
113 }
114
lengthInBytes() const115 size_t TextFragment::lengthInBytes() const { return mSize; }
116
lengthInCodePoints() const117 size_t TextFragment::lengthInCodePoints() const
118 {
119 utf::stringview<const char*> sv(mpText, mpText + mSize);
120 return sv.codepoints();
121 }
122
begin() const123 TextFragment::Iterator TextFragment::begin() const { return TextFragment::Iterator(getText()); }
124
end() const125 TextFragment::Iterator TextFragment::end() const { return Iterator(getText() + lengthInBytes()); }
126
// copy constructor: build this fragment from a copy of a's bytes.
TextFragment::TextFragment(const TextFragment& a) noexcept
{
  const char* sourceBytes = a.getText();
  const size_t sourceLength = a.lengthInBytes();
  construct(sourceBytes, sourceLength);
}
131
132 // just copy the data. If we want to optimize and use reference-counted strings
133 // at some point, look at fix_str for ideas.
operator =(const TextFragment & b)134 TextFragment& TextFragment::operator=(const TextFragment& b) noexcept
135 {
136 if (this != &b)
137 {
138 dispose();
139 create(b.mSize);
140 if (mpText)
141 {
142 const char* bText = b.mpText;
143 std::copy(bText, bText + mSize, mpText);
144 nullTerminate();
145 }
146 }
147 return *this;
148 }
149
TextFragment(TextFragment && b)150 TextFragment::TextFragment(TextFragment&& b) noexcept { moveDataFromOther(b); }
151
operator =(TextFragment && b)152 TextFragment& TextFragment::operator=(TextFragment&& b) noexcept
153 {
154 dispose();
155 moveDataFromOther(b);
156 return *this;
157 }
158
159 // multiple-fragment constructors, used instead of operator+
TextFragment(const TextFragment & a,const TextFragment & b)160 TextFragment::TextFragment(const TextFragment& a, const TextFragment& b) noexcept
161 {
162 construct(a.getText(), a.lengthInBytes(), b.getText(), b.lengthInBytes());
163 }
164
// make the concatenation of t1, t2 and t3, in that order.
TextFragment::TextFragment(const TextFragment& t1, const TextFragment& t2,
                           const TextFragment& t3) noexcept
{
  construct(t1.getText(), t1.lengthInBytes(),
            t2.getText(), t2.lengthInBytes(),
            t3.getText(), t3.lengthInBytes());
}
171
// make the concatenation of t1, t2, t3 and t4, in that order.
TextFragment::TextFragment(const TextFragment& t1, const TextFragment& t2, const TextFragment& t3,
                           const TextFragment& t4) noexcept
{
  construct(t1.getText(), t1.lengthInBytes(),
            t2.getText(), t2.lengthInBytes(),
            t3.getText(), t3.lengthInBytes(),
            t4.getText(), t4.lengthInBytes());
}
178
~TextFragment()179 TextFragment::~TextFragment() noexcept { dispose(); }
180
construct(const char * s1,size_t len1,const char * s2,size_t len2,const char * s3,size_t len3,const char * s4,size_t len4)181 void TextFragment::construct(const char* s1, size_t len1, const char* s2, size_t len2,
182 const char* s3, size_t len3, const char* s4, size_t len4) noexcept
183 {
184 create(len1 + len2 + len3 + len4);
185 if (mpText)
186 {
187 if (len1) std::copy(s1, s1 + len1, mpText);
188 if (len2) std::copy(s2, s2 + len2, mpText + len1);
189 if (len3) std::copy(s3, s3 + len3, mpText + len1 + len2);
190 if (len4) std::copy(s4, s4 + len4, mpText + len1 + len2 + len3);
191 nullTerminate();
192 }
193 }
194
create(size_t size)195 void TextFragment::create(size_t size) noexcept
196 {
197 mSize = size;
198 const size_t nullTerminatedSize = size + 1;
199 if (nullTerminatedSize > kShortFragmentSizeInChars)
200 {
201 mpText = static_cast<char*>(malloc(nullTerminatedSize));
202 }
203 else
204 {
205 mpText = mLocalText;
206 }
207 }
208
nullTerminate()209 void TextFragment::nullTerminate() noexcept { mpText[mSize] = 0; }
210
dispose()211 void TextFragment::dispose() noexcept
212 {
213 if (mpText)
214 {
215 assert(mpText[mSize] == 0);
216 if (mpText != mLocalText)
217 {
218 // free an external text. If the alloc has failed the ptr might be 0,
219 // which is OK
220 free(mpText);
221 }
222 mpText = 0;
223 }
224 }
225
moveDataFromOther(TextFragment & b)226 void TextFragment::moveDataFromOther(TextFragment& b)
227 {
228 mSize = b.mSize;
229 if (mSize >= kShortFragmentSizeInChars)
230 {
231 // move the data
232 mpText = b.mpText;
233 }
234 else
235 {
236 /*
237 TODO use SmallStackBuffer! and test
238 */
239
240 // point to local storage and copy data
241 mpText = mLocalText;
242 std::copy(b.mLocalText, b.mLocalText + mSize, mLocalText);
243 nullTerminate();
244 }
245
246 // mark b as empty, nothing to dispose
247 b.mpText = b.mLocalText;
248 b.mSize = 0;
249 b.nullTerminate();
250 }
251
validateCodePoint(CodePoint c)252 bool validateCodePoint(CodePoint c) { return utf::internal::validate_codepoint(c); }
253
254 // return UTF-8 encoded vector of bytes without null terminator
textToByteVector(TextFragment frag)255 std::vector<uint8_t> textToByteVector(TextFragment frag)
256 {
257 return std::vector<uint8_t>(frag.getText(), frag.getText() + frag.lengthInBytes());
258 }
259
byteVectorToText(const std::vector<uint8_t> & v)260 TextFragment byteVectorToText(const std::vector<uint8_t>& v)
261 {
262 if (!v.size()) return TextFragment();
263 const uint8_t* p = v.data();
264 return TextFragment(reinterpret_cast<const char*>(p), v.size());
265 }
266
267 // TODO small stack objects here to make random access class, don't use
268 // std::vector
textToCodePoints(TextFragment frag)269 std::vector<CodePoint> textToCodePoints(TextFragment frag)
270 {
271 std::vector<CodePoint> r;
272 for (CodePoint c : frag)
273 {
274 r.push_back(c);
275 }
276 return r;
277 }
278
codePointsToText(std::vector<CodePoint> cv)279 TextFragment codePointsToText(std::vector<CodePoint> cv)
280 {
281 auto sv = utf::make_stringview(cv.begin(), cv.end());
282 std::vector<char> outVec;
283 sv.to<utf::utf8>(std::back_inserter(outVec));
284 return TextFragment(outVec.data(), outVec.size());
285 }
286
287 } // namespace ml
288