1 /**
2 * Copyright (c) 2014, Timothy Stack
3 *
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * * Redistributions of source code must retain the above copyright notice, this
10 * list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 * * Neither the name of Timothy Stack nor the names of its contributors
15 * may be used to endorse or promote products derived from this software
16 * without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
19 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
22 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 *
29 * @file intern_string.hh
30 */
31
32 #ifndef intern_string_hh
33 #define intern_string_hh
34
35 #include <string.h>
36 #include <sys/types.h>
37
38 #include <string>
39
40 #include "optional.hpp"
41 #include "strnatcmp.h"
42 #include "fmt/format.h"
43
44 struct string_fragment {
45 using iterator = const char *;
46
string_fragmentstring_fragment47 explicit string_fragment(const char *str = "", int begin = 0, int end = -1)
48 : sf_string(str), sf_begin(begin), sf_end(end == -1 ? strlen(str) : end) {
49 };
50
string_fragmentstring_fragment51 explicit string_fragment(const unsigned char *str, int begin = 0, int end = -1)
52 : sf_string((const char *) str),
53 sf_begin(begin),
54 sf_end(end == -1 ? strlen((const char *) str) : end) {
55 };
56
string_fragmentstring_fragment57 string_fragment(const std::string &str)
58 : sf_string(str.c_str()), sf_begin(0), sf_end(str.length()) {
59
60 }
61
is_validstring_fragment62 bool is_valid() const {
63 return this->sf_begin != -1;
64 };
65
lengthstring_fragment66 int length() const {
67 return this->sf_end - this->sf_begin;
68 };
69
datastring_fragment70 const char *data() const {
71 return &this->sf_string[this->sf_begin];
72 }
73
beginstring_fragment74 iterator begin() const {
75 return &this->sf_string[this->sf_begin];
76 }
77
endstring_fragment78 iterator end() const {
79 return &this->sf_string[this->sf_end];
80 }
81
emptystring_fragment82 bool empty() const {
83 return !this->is_valid() || length() == 0;
84 };
85
operator []string_fragment86 char operator[](int index) const {
87 return this->sf_string[sf_begin + index];
88 };
89
operator ==string_fragment90 bool operator==(const std::string &str) const {
91 if (this->length() != (int) str.length()) {
92 return false;
93 }
94
95 return memcmp(&this->sf_string[this->sf_begin],
96 str.c_str(),
97 str.length()) == 0;
98 };
99
operator ==string_fragment100 bool operator==(const string_fragment &sf) const {
101 if (this->length() != sf.length()) {
102 return false;
103 }
104
105 return memcmp(this->data(), sf.data(), sf.length()) == 0;
106 };
107
iequalstring_fragment108 bool iequal(const string_fragment &sf) const {
109 if (this->length() != sf.length()) {
110 return false;
111 }
112
113 return strnatcasecmp(this->length(), this->data(),
114 sf.length(), sf.data()) == 0;
115 };
116
operator ==string_fragment117 bool operator==(const char *str) const {
118 size_t len = strlen(str);
119
120 return len == (size_t) this->length() &&
121 strncmp(this->data(), str, this->length()) == 0;
122 };
123
operator !=string_fragment124 bool operator!=(const char *str) const {
125 return !(*this == str);
126 }
127
startswithstring_fragment128 bool startswith(const char *prefix) const {
129 auto iter = this->begin();
130
131 while (*prefix != '\0' && *prefix == *iter && iter < this->end()) {
132 prefix += 1;
133 iter += 1;
134 }
135
136 return *prefix == '\0';
137 }
138
substrstring_fragment139 string_fragment substr(int begin) const {
140 return string_fragment{
141 this->sf_string,
142 this->sf_begin + begin,
143 this->sf_end
144 };
145 }
146
findstring_fragment147 nonstd::optional<size_t> find(char ch) const {
148 for (int lpc = this->sf_begin; lpc < this->sf_end; lpc++) {
149 if (this->sf_string[lpc] == ch) {
150 return lpc;
151 }
152 }
153
154 return nonstd::nullopt;
155 }
156
157 template<typename P>
consumestring_fragment158 nonstd::optional<string_fragment> consume(P predicate) const {
159 int consumed = 0;
160 while (consumed < this->length()) {
161 if (!predicate(this->data()[consumed])) {
162 break;
163 }
164
165 consumed += 1;
166 }
167
168 if (consumed == 0) {
169 return nonstd::nullopt;
170 }
171
172 return string_fragment{
173 this->sf_string,
174 this->sf_begin + consumed,
175 this->sf_end,
176 };
177 }
178
consume_nstring_fragment179 nonstd::optional<string_fragment> consume_n(int amount) const {
180 if (amount > this->length()) {
181 return nonstd::nullopt;
182 }
183
184 return string_fragment{
185 this->sf_string,
186 this->sf_begin + amount,
187 this->sf_end,
188 };
189 }
190
191 template<typename P>
skipstring_fragment192 string_fragment skip(P predicate) const {
193 int offset = 0;
194 while (offset < this->length() && predicate(this->data()[offset])) {
195 offset += 1;
196 }
197
198 return string_fragment{
199 this->sf_string,
200 this->sf_begin + offset,
201 this->sf_end,
202 };
203 }
204
205 using split_result = nonstd::optional<std::pair<string_fragment, string_fragment>>;
206
207 template<typename P>
split_whilestring_fragment208 split_result split_while(P& predicate) const {
209 int consumed = 0;
210 while (consumed < this->length()) {
211 if (!predicate(this->data()[consumed])) {
212 break;
213 }
214
215 consumed += 1;
216 }
217
218 if (consumed == 0) {
219 return nonstd::nullopt;
220 }
221
222 return std::make_pair(
223 string_fragment{
224 this->sf_string,
225 this->sf_begin,
226 this->sf_begin + consumed,
227 },
228 string_fragment{
229 this->sf_string,
230 this->sf_begin + consumed,
231 this->sf_end,
232 }
233 );
234 }
235
236 struct tag1 {
237 const char t_value;
238
operator ()string_fragment::tag1239 bool operator()(char ch) const {
240 return this->t_value == ch;
241 }
242 };
243
244 struct quoted_string_body {
245 bool qs_in_escape{false};
246
operator ()string_fragment::quoted_string_body247 bool operator()(char ch) {
248 if (this->qs_in_escape) {
249 this->qs_in_escape = false;
250 return true;
251 } else if (ch == '\\') {
252 this->qs_in_escape = true;
253 return true;
254 } else if (ch == '"') {
255 return false;
256 } else {
257 return true;
258 }
259 }
260 };
261
to_stringstring_fragment262 const char *to_string(char *buf) const {
263 memcpy(buf, this->data(), this->length());
264 buf[this->length()] = '\0';
265
266 return buf;
267 };
268
to_stringstring_fragment269 std::string to_string() const {
270 return {this->data(), (size_t) this->length()};
271 }
272
clearstring_fragment273 void clear() {
274 this->sf_begin = 0;
275 this->sf_end = 0;
276 };
277
invalidatestring_fragment278 void invalidate() {
279 this->sf_begin = -1;
280 this->sf_end = -1;
281 };
282
trimstring_fragment283 void trim(const char *tokens) {
284 while (this->sf_begin < this->sf_end) {
285 bool found = false;
286
287 for (int lpc = 0; tokens[lpc] != '\0'; lpc++) {
288 if (this->sf_string[this->sf_begin] == tokens[lpc]) {
289 found = true;
290 break;
291 }
292 }
293 if (!found) {
294 break;
295 }
296
297 this->sf_begin += 1;
298 }
299 while (this->sf_begin < this->sf_end) {
300 bool found = false;
301
302 for (int lpc = 0; tokens[lpc] != '\0'; lpc++) {
303 if (this->sf_string[this->sf_end - 1] == tokens[lpc]) {
304 found = true;
305 break;
306 }
307 }
308 if (!found) {
309 break;
310 }
311
312 this->sf_end -= 1;
313 }
314 }
315
316 const char *sf_string;
317 int sf_begin;
318 int sf_end;
319 };
320
operator <(const char * left,const string_fragment & right)321 inline bool operator<(const char *left, const string_fragment &right) {
322 int rc = strncmp(left, right.data(), right.length());
323 return rc < 0;
324 }
325
operator <(const string_fragment & left,const char * right)326 inline bool operator<(const string_fragment &left, const char *right) {
327 return strncmp(left.data(), right, left.length()) < 0;
328 }
329
330 class intern_string {
331
332 public:
333 static const intern_string *lookup(const char *str, ssize_t len) noexcept;
334
335 static const intern_string *lookup(const string_fragment &sf) noexcept;
336
337 static const intern_string *lookup(const std::string &str) noexcept;
338
get() const339 const char *get() const {
340 return this->is_str.c_str();
341 };
342
size() const343 size_t size() const {
344 return this->is_str.size();
345 }
346
to_string() const347 std::string to_string() const {
348 return this->is_str;
349 }
350
to_string_fragment() const351 string_fragment to_string_fragment() const {
352 return string_fragment{this->is_str};
353 }
354
startswith(const char * prefix) const355 bool startswith(const char *prefix) const {
356 const char *curr = this->is_str.data();
357
358 while (*prefix != '\0' && *prefix == *curr) {
359 prefix += 1;
360 curr += 1;
361 }
362
363 return *prefix == '\0';
364 }
365
366 struct intern_table;
367 static std::shared_ptr<intern_table> get_table_lifetime();
368 private:
369 friend intern_table;
370
intern_string(const char * str,ssize_t len)371 intern_string(const char *str, ssize_t len)
372 : is_next(nullptr), is_str(str, (size_t) len) {
373 }
374
375 intern_string *is_next;
376 std::string is_str;
377 };
378
379 using intern_table_lifetime = std::shared_ptr<intern_string::intern_table>;
380
381 class intern_string_t {
382 public:
383 using iterator = const char *;
384
intern_string_t(const intern_string * is=nullptr)385 intern_string_t(const intern_string *is = nullptr) : ist_interned_string(is) {
386
387 }
388
unwrap() const389 const intern_string *unwrap() const {
390 return this->ist_interned_string;
391 }
392
clear()393 void clear() {
394 this->ist_interned_string = nullptr;
395 };
396
empty() const397 bool empty() const {
398 return this->ist_interned_string == nullptr;
399 }
400
get() const401 const char *get() const {
402 if (this->empty()) {
403 return "";
404 }
405 return this->ist_interned_string->get();
406 }
407
begin() const408 iterator begin() const {
409 return this->get();
410 }
411
end() const412 iterator end() const {
413 return this->get() + this->size();
414 }
415
size() const416 size_t size() const {
417 if (this->ist_interned_string == nullptr) {
418 return 0;
419 }
420 return this->ist_interned_string->size();
421 }
422
hash() const423 size_t hash() const {
424 auto ptr = (uintptr_t) this->ist_interned_string;
425
426 return ptr;
427 }
428
to_string() const429 std::string to_string() const {
430 if (this->ist_interned_string == nullptr) {
431 return "";
432 }
433 return this->ist_interned_string->to_string();
434 }
435
to_string_fragment() const436 string_fragment to_string_fragment() const {
437 if (this->ist_interned_string == nullptr) {
438 return string_fragment{"", 0, 0};
439 }
440 return this->ist_interned_string->to_string_fragment();
441 }
442
operator <(const intern_string_t & rhs) const443 bool operator<(const intern_string_t &rhs) const {
444 return strcmp(this->get(), rhs.get()) < 0;
445 }
446
operator ==(const intern_string_t & rhs) const447 bool operator==(const intern_string_t &rhs) const {
448 return this->ist_interned_string == rhs.ist_interned_string;
449 }
450
operator !=(const intern_string_t & rhs) const451 bool operator!=(const intern_string_t &rhs) const {
452 return !(*this == rhs);
453 }
454
operator ==(const char * rhs) const455 bool operator==(const char *rhs) const {
456 return strcmp(this->get(), rhs) == 0;
457 }
458
operator !=(const char * rhs) const459 bool operator!=(const char *rhs) const {
460 return strcmp(this->get(), rhs) != 0;
461 }
462
463 private:
464 const intern_string *ist_interned_string;
465 };
466
467 unsigned long hash_str(const char *str, size_t len);
468
469 namespace fmt {
470 template<>
471 struct formatter<string_fragment> : formatter<string_view> {
472 template<typename FormatContext>
formatfmt::formatter473 auto format(const string_fragment &sf, FormatContext &ctx)
474 {
475 return formatter<string_view>::format(
476 string_view{sf.data(), (size_t) sf.length()}, ctx);
477 }
478 };
479
480 template<>
481 struct formatter<intern_string_t> : formatter<string_view> {
482 template<typename FormatContext>
formatfmt::formatter483 auto format(const intern_string_t& is, FormatContext &ctx)
484 {
485 return formatter<string_view>::format(
486 string_view{is.get(), (size_t) is.size()}, ctx);
487 }
488 };
489 }
490
491 namespace std {
492 template <>
493 struct hash<const intern_string_t> {
operator ()std::hash494 std::size_t operator()(const intern_string_t &ist) const {
495 return ist.hash();
496 }
497 };
498 }
499
operator <(const char * left,const intern_string_t & right)500 inline bool operator<(const char *left, const intern_string_t &right) {
501 int rc = strncmp(left, right.get(), right.size());
502 return rc < 0;
503 }
504
operator <(const intern_string_t & left,const char * right)505 inline bool operator<(const intern_string_t &left, const char *right) {
506 return strncmp(left.get(), right, left.size()) < 0;
507 }
508
operator ==(const intern_string_t & left,const string_fragment & sf)509 inline bool operator==(const intern_string_t &left, const string_fragment &sf) {
510 return ((int) left.size() == sf.length()) &&
511 (memcmp(left.get(), sf.data(), left.size()) == 0);
512 }
513
operator ==(const string_fragment & left,const intern_string_t & right)514 inline bool operator==(const string_fragment &left, const intern_string_t &right) {
515 return (left.length() == (int) right.size()) &&
516 (memcmp(left.data(), right.get(), left.length()) == 0);
517 }
518
519 namespace std {
to_string(const string_fragment & s)520 inline string to_string(const string_fragment &s) {
521 return {s.data(), (size_t) s.length()};
522 }
523
to_string(const intern_string_t & s)524 inline string to_string(const intern_string_t &s) {
525 return s.to_string();
526 }
527 }
528
to_string_fragment(const string_fragment & s)529 inline string_fragment to_string_fragment(const string_fragment &s) {
530 return s;
531 }
532
to_string_fragment(const intern_string_t & s)533 inline string_fragment to_string_fragment(const intern_string_t &s) {
534 return string_fragment(s.get(), 0, s.size());
535 }
536
to_string_fragment(const std::string & s)537 inline string_fragment to_string_fragment(const std::string &s) {
538 return string_fragment(s.c_str(), 0, s.length());
539 }
540
541 #endif
542