1 // madronalib: a C++ framework for DSP applications.
2 // Copyright (c) 2020 Madrona Labs LLC. http://www.madronalabs.com
3 // Distributed under the MIT license: http://madrona-labs.mit-license.org/
4 
5 // assignable but otherwise immutable UTF-8 text object class
6 
7 #pragma once
8 
9 #include <memory>
10 #include <string>  // to remove
11 #include <vector>
12 
13 namespace ml
14 {
15 // ----------------------------------------------------------------
// TextFragment - a minimal, immutable string class. Guaranteed not to allocate
// heap memory if the length in bytes is below kShortFragmentSizeInChars.
18 
// Size of the in-object text buffer, counted in code points.
static constexpr int kShortFragmentSizeInCodePoints{16};
// The same size counted in chars: 4 chars are reserved per code point.
static constexpr int kShortFragmentSizeInChars{4 * kShortFragmentSizeInCodePoints};

// A single code point, stored as a 32-bit character.
using CodePoint = char32_t;
23 
24 class TextFragment
25 {
26  public:
27   class Iterator
28   {
29     class Impl;
30     std::unique_ptr<Impl> pImpl;
31 
32    public:
33     Iterator(const char* pos);
34 
35     ~Iterator();  // defined in the implementation file, where impl is a
36                   // complete type
37     //	Iterator(Iterator&&) = default;
38     Iterator(const Iterator&);
39     //	Iterator& operator=(Iterator&&); // defined in the implementation file
40     Iterator& operator=(const Iterator&) = delete;
41 
42     CodePoint operator*();
43     //		CodePoint operator->() { return _utf8Iter.operator->(); }
44     Iterator& operator++();
45     CodePoint operator++(int i);
46 
47     friend bool operator!=(Iterator lhs, Iterator rhs);
48     friend bool operator==(Iterator lhs, Iterator rhs);
49   };
50 
51   TextFragment() noexcept;
52 
53   /*
54    this could be a good idea but the (const char*) ctor is taking precedence,
55   revisit template<size_t N> TextFragment(const char(&p)[N]) noexcept : mSize(N)
56   {
57           std::cout << "?";
58           create();
59           if(mpText)
60           {
61                   std::copy(p, p + mSize, mpText);
62                   nullTerminate();
63           }
64   }
65    */
66 
67   //
68   TextFragment(const char* pChars) noexcept;
69 
70   // this ctor can be used to save the work of counting the length of the input
71   // if we know it already, as with static HashedCharArrays.
72   TextFragment(const char* pChars, size_t len) noexcept;
73 
74   // single code point ctor
75   TextFragment(CodePoint c) noexcept;
76 
77   // copy ctor
78   TextFragment(const TextFragment& a) noexcept;
79 
80   // copy assignment operator: TextFragment is assignable but otherwise
81   // immutable.
82   TextFragment& operator=(const TextFragment& b) noexcept;
83 
84   // move ctor
85   TextFragment(TextFragment&& b) noexcept;
86 
87   // move assignment operator
88   TextFragment& operator=(TextFragment&& b) noexcept;
89 
90   // use these ctors instead of operator+.
91   TextFragment(const TextFragment& a, const TextFragment& b) noexcept;
92   TextFragment(const TextFragment& a, const TextFragment& b, const TextFragment& c) noexcept;
93   TextFragment(const TextFragment& a, const TextFragment& b, const TextFragment& c,
94                const TextFragment& d) noexcept;
95 
96   ~TextFragment() noexcept;
97 
98   explicit operator bool() const { return mSize > 0; }
99 
100   size_t lengthInBytes() const;
101 
102   size_t lengthInCodePoints() const;
103 
104   Iterator begin() const;
105   Iterator end() const;
106 
getText()107   inline const char* getText() const { return mpText; }
108 
beginsWith(const TextFragment & fb)109   inline bool beginsWith(const TextFragment& fb) const
110   {
111     size_t lenA = lengthInBytes();
112     size_t lenB = fb.lengthInBytes();
113     if (lenB > lenA) return false;
114     for (int i = 0; i < lenB; ++i)
115     {
116       if (mpText[i] != fb.mpText[i])
117       {
118         return false;
119       }
120     }
121     return true;
122   }
123 
endsWith(const TextFragment & fb)124   inline bool endsWith(const TextFragment& fb) const
125   {
126     size_t lenA = lengthInBytes();
127     size_t lenB = fb.lengthInBytes();
128     if (lenB > lenA) return false;
129     for (int i = 0; i < lenB; ++i)
130     {
131       if (mpText[lenA - lenB + i] != fb.mpText[i])
132       {
133         return false;
134       }
135     }
136     return true;
137   }
138 
139   // deprecated! MLTEST
toString()140   inline std::string toString() const { return std::string(mpText); }
141 
142  private:
143   void construct(const char* s1, size_t len1, const char* s2 = nullptr, size_t len2 = 0,
144                  const char* s3 = nullptr, size_t len3 = 0, const char* s4 = nullptr,
145                  size_t len4 = 0) noexcept;
146 
147   void create(size_t size) noexcept;
148   void nullTerminate() noexcept;
149   void dispose() noexcept;
150   void moveDataFromOther(TextFragment& b);
151 
152   // TODO these things could share space, as in SmallStackBuffer
153   char* mpText;
154   char mLocalText[kShortFragmentSizeInChars];
155 
156   // size of data in bytes, without null terminator
157   size_t mSize;
158 };
159 
// Byte-wise equality of two sized char arrays.
// Returns true only when both lengths match and every byte matches. Two
// zero-length arrays compare equal without either pointer being read.
inline bool compareSizedCharArrays(const char* pA, size_t lenA, const char* pB, size_t lenB)
{
  // Arrays of different sizes can never be equal.
  if (lenA != lenB)
  {
    return false;
  }

  // Walk both arrays in step and stop at the first mismatch. When the
  // shared length is zero the loop body never runs.
  bool allEqual = true;
  size_t index = 0;
  while (allEqual && (index != lenA))
  {
    allEqual = (pA[index] == pB[index]);
    ++index;
  }

  return allEqual;
}
175 
176 // TODO made operator== a free function-	do likewise for other classes
177 
178 inline bool operator==(const TextFragment a, const TextFragment b)
179 {
180   return compareSizedCharArrays(a.getText(), a.lengthInBytes(), b.getText(), b.lengthInBytes());
181 }
182 
183 inline bool operator!=(TextFragment a, TextFragment b) { return !(a == b); }
184 
185 inline std::ostream& operator<<(std::ostream& out, const TextFragment& r)
186 {
187   const char* c = r.getText();
188   out << c;
189   return out;
190 }
191 
192 // ----------------------------------------------------------------
193 // Text - a placeholder for more features later like localization
194 
195 typedef TextFragment Text;
196 
197 // ----------------------------------------------------------------
198 // functions
199 
// Checks a single code point and reports the result as a bool.
// NOTE(review): presumably true means c is a valid Unicode code point -
// confirm against the definition in the implementation file.
bool validateCodePoint(CodePoint c);

// Conversions between a fragment and a vector of bytes. Only declared here;
// NOTE(review): whether a null terminator is included in the byte vector is
// not visible from this header - confirm in the implementation file.
std::vector<uint8_t> textToByteVector(TextFragment frag);
TextFragment byteVectorToText(const std::vector<uint8_t>& v);

// Conversions between a fragment and a vector of code points. Only declared
// here; both take their argument by value.
std::vector<CodePoint> textToCodePoints(TextFragment frag);
TextFragment codePointsToText(std::vector<CodePoint> cv);
207 
208 }  // namespace ml
209