1 // madronalib: a C++ framework for DSP applications.
2 // Copyright (c) 2020 Madrona Labs LLC. http://www.madronalabs.com
3 // Distributed under the MIT license: http://madrona-labs.mit-license.org/
4 
5 #include "MLText.h"
6 
7 #include <cstring>
8 #include <iostream>
9 #include <vector>
10 
11 #include "MLMemoryUtils.h"
12 #include "utf.hpp"
13 
14 namespace ml
15 {
16 // Iterator::Impl
17 
18 class TextFragment::Iterator::Impl
19 {
20   friend class TextFragment::Iterator;
21   friend bool operator!=(Iterator lhs, Iterator rhs);
22   friend bool operator==(Iterator lhs, Iterator rhs);
23 
24   utf::codepoint_iterator<const char*> _utf8Iter;
25 
26  public:
Impl(const char * pos)27   Impl(const char* pos) : _utf8Iter(utf::codepoint_iterator<const char*>(pos)) {}
Impl(const utf::codepoint_iterator<const char * > & utf_iter)28   Impl(const utf::codepoint_iterator<const char*>& utf_iter) : _utf8Iter(utf_iter) {}
29 };
30 
31 // Iterator
32 
Iterator(const char * pos)33 TextFragment::Iterator::Iterator(const char* pos) { pImpl = std::unique_ptr<Impl>(new Impl(pos)); }
34 
Iterator(const Iterator & it)35 TextFragment::Iterator::Iterator(const Iterator& it)  // = default;
36 {
37   pImpl = std::unique_ptr<Impl>(new Impl(it.pImpl->_utf8Iter));
38 }
39 
40 TextFragment::Iterator::~Iterator() = default;
41 
operator *()42 CodePoint TextFragment::Iterator::operator*() { return pImpl->_utf8Iter.operator*(); }
43 
44 //		CodePoint operator->() { return _utf8Iter.operator->(); }
45 
operator ++()46 TextFragment::Iterator& TextFragment::Iterator::operator++()
47 {
48   pImpl->_utf8Iter.operator++();
49   return *this;
50 }
51 
operator ++(int i)52 CodePoint TextFragment::Iterator::operator++(int i)
53 {
54   CodePoint preIncrementValue = pImpl->_utf8Iter.operator*();
55   pImpl->_utf8Iter.operator++(i);
56   return preIncrementValue;
57 }
58 
operator !=(TextFragment::Iterator lhs,TextFragment::Iterator rhs)59 bool operator!=(TextFragment::Iterator lhs, TextFragment::Iterator rhs)
60 {
61   return lhs.pImpl->_utf8Iter != rhs.pImpl->_utf8Iter;
62 }
63 
operator ==(TextFragment::Iterator lhs,TextFragment::Iterator rhs)64 bool operator==(TextFragment::Iterator lhs, TextFragment::Iterator rhs)
65 {
66   return !(lhs.pImpl->_utf8Iter != rhs.pImpl->_utf8Iter);
67 }
68 
69 // TextFragment
70 
TextFragment()71 TextFragment::TextFragment() noexcept
72 {
73   mSize = 0;
74   mpText = mLocalText;
75   nullTerminate();
76 }
77 
TextFragment(const char * pChars)78 TextFragment::TextFragment(const char* pChars) noexcept
79 {
80   create(strlen(pChars));
81   // a bad alloc will result in this being a null object.
82   // copy the input string into local storage
83   if (mpText)
84   {
85     std::copy(pChars, pChars + mSize, mpText);
86     nullTerminate();
87   }
88 }
89 
90 // this ctor can be used to save the work of counting the length if we have a
91 // length already, as with static HashedCharArrays.
TextFragment(const char * pChars,size_t len)92 TextFragment::TextFragment(const char* pChars, size_t len) noexcept
93 {
94   create(len);
95   if (mpText)
96   {
97     std::copy(pChars, pChars + mSize, mpText);
98     nullTerminate();
99   }
100 }
101 
TextFragment(CodePoint c)102 TextFragment::TextFragment(CodePoint c) noexcept
103 {
104   if (!validateCodePoint(c))
105   {
106     c = 0x2639;  // sad face
107   }
108   // all possible codepoints fit into local text
109   char* end = utf::internal::utf_traits<utf::utf8>::encode(c, mLocalText);
110   mSize = end - mLocalText;
111   mpText = mLocalText;
112   nullTerminate();
113 }
114 
lengthInBytes() const115 size_t TextFragment::lengthInBytes() const { return mSize; }
116 
lengthInCodePoints() const117 size_t TextFragment::lengthInCodePoints() const
118 {
119   utf::stringview<const char*> sv(mpText, mpText + mSize);
120   return sv.codepoints();
121 }
122 
begin() const123 TextFragment::Iterator TextFragment::begin() const { return TextFragment::Iterator(getText()); }
124 
end() const125 TextFragment::Iterator TextFragment::end() const { return Iterator(getText() + lengthInBytes()); }
126 
// Copy constructor: builds this fragment from a copy of a's bytes via construct().
TextFragment::TextFragment(const TextFragment& a) noexcept
{
  const char* const sourceText = a.getText();
  const size_t sourceBytes = a.lengthInBytes();
  construct(sourceText, sourceBytes);
}
131 
132 // just copy the data. If we want to optimize and use reference-counted strings
133 // at some point, look at fix_str for ideas.
operator =(const TextFragment & b)134 TextFragment& TextFragment::operator=(const TextFragment& b) noexcept
135 {
136   if (this != &b)
137   {
138     dispose();
139     create(b.mSize);
140     if (mpText)
141     {
142       const char* bText = b.mpText;
143       std::copy(bText, bText + mSize, mpText);
144       nullTerminate();
145     }
146   }
147   return *this;
148 }
149 
TextFragment(TextFragment && b)150 TextFragment::TextFragment(TextFragment&& b) noexcept { moveDataFromOther(b); }
151 
operator =(TextFragment && b)152 TextFragment& TextFragment::operator=(TextFragment&& b) noexcept
153 {
154   dispose();
155   moveDataFromOther(b);
156   return *this;
157 }
158 
159 // multiple-fragment constructors, used instead of operator+
TextFragment(const TextFragment & a,const TextFragment & b)160 TextFragment::TextFragment(const TextFragment& a, const TextFragment& b) noexcept
161 {
162   construct(a.getText(), a.lengthInBytes(), b.getText(), b.lengthInBytes());
163 }
164 
// Construct the concatenation of three fragments, in argument order.
TextFragment::TextFragment(const TextFragment& t1, const TextFragment& t2,
                           const TextFragment& t3) noexcept
{
  construct(t1.getText(), t1.lengthInBytes(),  //
            t2.getText(), t2.lengthInBytes(),  //
            t3.getText(), t3.lengthInBytes());
}
171 
// Construct the concatenation of four fragments, in argument order.
TextFragment::TextFragment(const TextFragment& t1, const TextFragment& t2, const TextFragment& t3,
                           const TextFragment& t4) noexcept
{
  construct(t1.getText(), t1.lengthInBytes(),  //
            t2.getText(), t2.lengthInBytes(),  //
            t3.getText(), t3.lengthInBytes(),  //
            t4.getText(), t4.lengthInBytes());
}
178 
~TextFragment()179 TextFragment::~TextFragment() noexcept { dispose(); }
180 
construct(const char * s1,size_t len1,const char * s2,size_t len2,const char * s3,size_t len3,const char * s4,size_t len4)181 void TextFragment::construct(const char* s1, size_t len1, const char* s2, size_t len2,
182                              const char* s3, size_t len3, const char* s4, size_t len4) noexcept
183 {
184   create(len1 + len2 + len3 + len4);
185   if (mpText)
186   {
187     if (len1) std::copy(s1, s1 + len1, mpText);
188     if (len2) std::copy(s2, s2 + len2, mpText + len1);
189     if (len3) std::copy(s3, s3 + len3, mpText + len1 + len2);
190     if (len4) std::copy(s4, s4 + len4, mpText + len1 + len2 + len3);
191     nullTerminate();
192   }
193 }
194 
create(size_t size)195 void TextFragment::create(size_t size) noexcept
196 {
197   mSize = size;
198   const size_t nullTerminatedSize = size + 1;
199   if (nullTerminatedSize > kShortFragmentSizeInChars)
200   {
201     mpText = static_cast<char*>(malloc(nullTerminatedSize));
202   }
203   else
204   {
205     mpText = mLocalText;
206   }
207 }
208 
nullTerminate()209 void TextFragment::nullTerminate() noexcept { mpText[mSize] = 0; }
210 
dispose()211 void TextFragment::dispose() noexcept
212 {
213   if (mpText)
214   {
215     assert(mpText[mSize] == 0);
216     if (mpText != mLocalText)
217     {
218       // free an external text. If the alloc has failed the ptr might be 0,
219       // which is OK
220       free(mpText);
221     }
222     mpText = 0;
223   }
224 }
225 
moveDataFromOther(TextFragment & b)226 void TextFragment::moveDataFromOther(TextFragment& b)
227 {
228   mSize = b.mSize;
229   if (mSize >= kShortFragmentSizeInChars)
230   {
231     // move the data
232     mpText = b.mpText;
233   }
234   else
235   {
236     /*
237      TODO use SmallStackBuffer! and test
238      */
239 
240     // point to local storage and copy data
241     mpText = mLocalText;
242     std::copy(b.mLocalText, b.mLocalText + mSize, mLocalText);
243     nullTerminate();
244   }
245 
246   // mark b as empty, nothing to dispose
247   b.mpText = b.mLocalText;
248   b.mSize = 0;
249   b.nullTerminate();
250 }
251 
validateCodePoint(CodePoint c)252 bool validateCodePoint(CodePoint c) { return utf::internal::validate_codepoint(c); }
253 
254 // return UTF-8 encoded vector of bytes without null terminator
textToByteVector(TextFragment frag)255 std::vector<uint8_t> textToByteVector(TextFragment frag)
256 {
257   return std::vector<uint8_t>(frag.getText(), frag.getText() + frag.lengthInBytes());
258 }
259 
byteVectorToText(const std::vector<uint8_t> & v)260 TextFragment byteVectorToText(const std::vector<uint8_t>& v)
261 {
262   if (!v.size()) return TextFragment();
263   const uint8_t* p = v.data();
264   return TextFragment(reinterpret_cast<const char*>(p), v.size());
265 }
266 
267 // TODO small stack objects here to make random access class, don't use
268 // std::vector
textToCodePoints(TextFragment frag)269 std::vector<CodePoint> textToCodePoints(TextFragment frag)
270 {
271   std::vector<CodePoint> r;
272   for (CodePoint c : frag)
273   {
274     r.push_back(c);
275   }
276   return r;
277 }
278 
codePointsToText(std::vector<CodePoint> cv)279 TextFragment codePointsToText(std::vector<CodePoint> cv)
280 {
281   auto sv = utf::make_stringview(cv.begin(), cv.end());
282   std::vector<char> outVec;
283   sv.to<utf::utf8>(std::back_inserter(outVec));
284   return TextFragment(outVec.data(), outVec.size());
285 }
286 
287 }  // namespace ml
288