1 /**
2  * Copyright (c) 2014, Timothy Stack
3  *
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * * Redistributions of source code must retain the above copyright notice, this
10  * list of conditions and the following disclaimer.
11  * * Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  * * Neither the name of Timothy Stack nor the names of its contributors
15  * may be used to endorse or promote products derived from this software
16  * without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
19  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21  * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
22  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25  * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  *
29  * @file intern_string.hh
30  */
31 
32 #ifndef intern_string_hh
33 #define intern_string_hh
34 
35 #include <string.h>
36 #include <sys/types.h>
37 
38 #include <string>
39 
40 #include "optional.hpp"
41 #include "strnatcmp.h"
42 #include "fmt/format.h"
43 
44 struct string_fragment {
45     using iterator = const char *;
46 
string_fragmentstring_fragment47     explicit string_fragment(const char *str = "", int begin = 0, int end = -1)
48         : sf_string(str), sf_begin(begin), sf_end(end == -1 ? strlen(str) : end) {
49     };
50 
string_fragmentstring_fragment51     explicit string_fragment(const unsigned char *str, int begin = 0, int end = -1)
52         : sf_string((const char *) str),
53           sf_begin(begin),
54           sf_end(end == -1 ? strlen((const char *) str) : end) {
55     };
56 
string_fragmentstring_fragment57     string_fragment(const std::string &str)
58         : sf_string(str.c_str()), sf_begin(0), sf_end(str.length()) {
59 
60     }
61 
is_validstring_fragment62     bool is_valid() const {
63         return this->sf_begin != -1;
64     };
65 
lengthstring_fragment66     int length() const {
67         return this->sf_end - this->sf_begin;
68     };
69 
datastring_fragment70     const char *data() const {
71         return &this->sf_string[this->sf_begin];
72     }
73 
beginstring_fragment74     iterator begin() const {
75         return &this->sf_string[this->sf_begin];
76     }
77 
endstring_fragment78     iterator end() const {
79         return &this->sf_string[this->sf_end];
80     }
81 
emptystring_fragment82     bool empty() const {
83         return !this->is_valid() || length() == 0;
84     };
85 
operator []string_fragment86     char operator[](int index) const {
87         return this->sf_string[sf_begin + index];
88     };
89 
operator ==string_fragment90     bool operator==(const std::string &str) const {
91         if (this->length() != (int) str.length()) {
92             return false;
93         }
94 
95         return memcmp(&this->sf_string[this->sf_begin],
96                       str.c_str(),
97                       str.length()) == 0;
98     };
99 
operator ==string_fragment100     bool operator==(const string_fragment &sf) const {
101         if (this->length() != sf.length()) {
102             return false;
103         }
104 
105         return memcmp(this->data(), sf.data(), sf.length()) == 0;
106     };
107 
iequalstring_fragment108     bool iequal(const string_fragment &sf) const {
109         if (this->length() != sf.length()) {
110             return false;
111         }
112 
113         return strnatcasecmp(this->length(), this->data(),
114             sf.length(), sf.data()) == 0;
115     };
116 
operator ==string_fragment117     bool operator==(const char *str) const {
118         size_t len = strlen(str);
119 
120         return len == (size_t) this->length() &&
121                strncmp(this->data(), str, this->length()) == 0;
122     };
123 
operator !=string_fragment124     bool operator!=(const char *str) const {
125         return !(*this == str);
126     }
127 
startswithstring_fragment128     bool startswith(const char *prefix) const {
129         auto iter = this->begin();
130 
131         while (*prefix != '\0' && *prefix == *iter && iter < this->end()) {
132             prefix += 1;
133             iter += 1;
134         }
135 
136         return *prefix == '\0';
137     }
138 
substrstring_fragment139     string_fragment substr(int begin) const {
140         return string_fragment{
141             this->sf_string,
142             this->sf_begin + begin,
143             this->sf_end
144         };
145     }
146 
findstring_fragment147     nonstd::optional<size_t> find(char ch) const {
148         for (int lpc = this->sf_begin; lpc < this->sf_end; lpc++) {
149             if (this->sf_string[lpc] == ch) {
150                 return lpc;
151             }
152         }
153 
154         return nonstd::nullopt;
155     }
156 
157     template<typename P>
consumestring_fragment158     nonstd::optional<string_fragment> consume(P predicate) const {
159         int consumed = 0;
160         while (consumed < this->length()) {
161             if (!predicate(this->data()[consumed])) {
162                 break;
163             }
164 
165             consumed += 1;
166         }
167 
168         if (consumed == 0) {
169             return nonstd::nullopt;
170         }
171 
172         return string_fragment{
173             this->sf_string,
174             this->sf_begin + consumed,
175             this->sf_end,
176         };
177     }
178 
consume_nstring_fragment179     nonstd::optional<string_fragment> consume_n(int amount) const {
180         if (amount > this->length()) {
181             return nonstd::nullopt;
182         }
183 
184         return string_fragment{
185             this->sf_string,
186             this->sf_begin + amount,
187             this->sf_end,
188         };
189     }
190 
191     template<typename P>
skipstring_fragment192     string_fragment skip(P predicate) const {
193         int offset = 0;
194         while (offset < this->length() && predicate(this->data()[offset])) {
195             offset += 1;
196         }
197 
198         return string_fragment{
199             this->sf_string,
200             this->sf_begin + offset,
201             this->sf_end,
202         };
203     }
204 
205     using split_result = nonstd::optional<std::pair<string_fragment, string_fragment>>;
206 
207     template<typename P>
split_whilestring_fragment208     split_result split_while(P& predicate) const {
209         int consumed = 0;
210         while (consumed < this->length()) {
211             if (!predicate(this->data()[consumed])) {
212                 break;
213             }
214 
215             consumed += 1;
216         }
217 
218         if (consumed == 0) {
219             return nonstd::nullopt;
220         }
221 
222         return std::make_pair(
223             string_fragment{
224                 this->sf_string,
225                 this->sf_begin,
226                 this->sf_begin + consumed,
227             },
228             string_fragment{
229                 this->sf_string,
230                 this->sf_begin + consumed,
231                 this->sf_end,
232             }
233         );
234     }
235 
236     struct tag1 {
237         const char t_value;
238 
operator ()string_fragment::tag1239         bool operator()(char ch) const {
240             return this->t_value == ch;
241         }
242     };
243 
244     struct quoted_string_body {
245         bool qs_in_escape{false};
246 
operator ()string_fragment::quoted_string_body247         bool operator()(char ch) {
248             if (this->qs_in_escape) {
249                 this->qs_in_escape = false;
250                 return true;
251             } else if (ch == '\\') {
252                 this->qs_in_escape = true;
253                 return true;
254             } else if (ch == '"') {
255                 return false;
256             } else {
257                 return true;
258             }
259         }
260     };
261 
to_stringstring_fragment262     const char *to_string(char *buf) const {
263         memcpy(buf, this->data(), this->length());
264         buf[this->length()] = '\0';
265 
266         return buf;
267     };
268 
to_stringstring_fragment269     std::string to_string() const {
270         return {this->data(), (size_t) this->length()};
271     }
272 
clearstring_fragment273     void clear() {
274         this->sf_begin = 0;
275         this->sf_end = 0;
276     };
277 
invalidatestring_fragment278     void invalidate() {
279         this->sf_begin = -1;
280         this->sf_end = -1;
281     };
282 
trimstring_fragment283     void trim(const char *tokens) {
284         while (this->sf_begin < this->sf_end) {
285             bool found = false;
286 
287             for (int lpc = 0; tokens[lpc] != '\0'; lpc++) {
288                 if (this->sf_string[this->sf_begin] == tokens[lpc]) {
289                     found = true;
290                     break;
291                 }
292             }
293             if (!found) {
294                 break;
295             }
296 
297             this->sf_begin += 1;
298         }
299         while (this->sf_begin < this->sf_end) {
300             bool found = false;
301 
302             for (int lpc = 0; tokens[lpc] != '\0'; lpc++) {
303                 if (this->sf_string[this->sf_end - 1] == tokens[lpc]) {
304                     found = true;
305                     break;
306                 }
307             }
308             if (!found) {
309                 break;
310             }
311 
312             this->sf_end -= 1;
313         }
314     }
315 
316     const char *sf_string;
317     int sf_begin;
318     int sf_end;
319 };
320 
operator <(const char * left,const string_fragment & right)321 inline bool operator<(const char *left, const string_fragment &right) {
322     int rc = strncmp(left, right.data(), right.length());
323     return rc < 0;
324 }
325 
operator <(const string_fragment & left,const char * right)326 inline bool operator<(const string_fragment &left, const char *right) {
327     return strncmp(left.data(), right, left.length()) < 0;
328 }
329 
330 class intern_string {
331 
332 public:
333     static const intern_string *lookup(const char *str, ssize_t len) noexcept;
334 
335     static const intern_string *lookup(const string_fragment &sf) noexcept;
336 
337     static const intern_string *lookup(const std::string &str) noexcept;
338 
get() const339     const char *get() const {
340         return this->is_str.c_str();
341     };
342 
size() const343     size_t size() const {
344         return this->is_str.size();
345     }
346 
to_string() const347     std::string to_string() const {
348         return this->is_str;
349     }
350 
to_string_fragment() const351     string_fragment to_string_fragment() const {
352         return string_fragment{this->is_str};
353     }
354 
startswith(const char * prefix) const355     bool startswith(const char *prefix) const {
356         const char *curr = this->is_str.data();
357 
358         while (*prefix != '\0' && *prefix == *curr) {
359             prefix += 1;
360             curr += 1;
361         }
362 
363         return *prefix == '\0';
364     }
365 
366     struct intern_table;
367     static std::shared_ptr<intern_table> get_table_lifetime();
368 private:
369     friend intern_table;
370 
intern_string(const char * str,ssize_t len)371     intern_string(const char *str, ssize_t len)
372             : is_next(nullptr), is_str(str, (size_t) len) {
373     }
374 
375     intern_string *is_next;
376     std::string is_str;
377 };
378 
379 using intern_table_lifetime = std::shared_ptr<intern_string::intern_table>;
380 
381 class intern_string_t {
382 public:
383     using iterator = const char *;
384 
intern_string_t(const intern_string * is=nullptr)385     intern_string_t(const intern_string *is = nullptr) : ist_interned_string(is) {
386 
387     }
388 
unwrap() const389     const intern_string *unwrap() const {
390         return this->ist_interned_string;
391     }
392 
clear()393     void clear() {
394         this->ist_interned_string = nullptr;
395     };
396 
empty() const397     bool empty() const {
398         return this->ist_interned_string == nullptr;
399     }
400 
get() const401     const char *get() const {
402         if (this->empty()) {
403             return "";
404         }
405         return this->ist_interned_string->get();
406     }
407 
begin() const408     iterator begin() const {
409         return this->get();
410     }
411 
end() const412     iterator end() const {
413         return this->get() + this->size();
414     }
415 
size() const416     size_t size() const {
417         if (this->ist_interned_string == nullptr) {
418             return 0;
419         }
420         return this->ist_interned_string->size();
421     }
422 
hash() const423     size_t hash() const {
424         auto ptr = (uintptr_t) this->ist_interned_string;
425 
426         return ptr;
427     }
428 
to_string() const429     std::string to_string() const {
430         if (this->ist_interned_string == nullptr) {
431             return "";
432         }
433         return this->ist_interned_string->to_string();
434     }
435 
to_string_fragment() const436     string_fragment to_string_fragment() const {
437         if (this->ist_interned_string == nullptr) {
438             return string_fragment{"", 0, 0};
439         }
440         return this->ist_interned_string->to_string_fragment();
441     }
442 
operator <(const intern_string_t & rhs) const443     bool operator<(const intern_string_t &rhs) const {
444         return strcmp(this->get(), rhs.get()) < 0;
445     }
446 
operator ==(const intern_string_t & rhs) const447     bool operator==(const intern_string_t &rhs) const {
448         return this->ist_interned_string == rhs.ist_interned_string;
449     }
450 
operator !=(const intern_string_t & rhs) const451     bool operator!=(const intern_string_t &rhs) const {
452         return !(*this == rhs);
453     }
454 
operator ==(const char * rhs) const455     bool operator==(const char *rhs) const {
456         return strcmp(this->get(), rhs) == 0;
457     }
458 
operator !=(const char * rhs) const459     bool operator!=(const char *rhs) const {
460         return strcmp(this->get(), rhs) != 0;
461     }
462 
463 private:
464     const intern_string *ist_interned_string;
465 };
466 
467 unsigned long hash_str(const char *str, size_t len);
468 
469 namespace fmt {
470 template<>
471 struct formatter<string_fragment> : formatter<string_view> {
472     template<typename FormatContext>
formatfmt::formatter473     auto format(const string_fragment &sf, FormatContext &ctx)
474     {
475         return formatter<string_view>::format(
476             string_view{sf.data(), (size_t) sf.length()}, ctx);
477     }
478 };
479 
480 template<>
481 struct formatter<intern_string_t> : formatter<string_view> {
482     template<typename FormatContext>
formatfmt::formatter483     auto format(const intern_string_t& is, FormatContext &ctx)
484     {
485         return formatter<string_view>::format(
486             string_view{is.get(), (size_t) is.size()}, ctx);
487     }
488 };
489 }
490 
491 namespace std {
492     template <>
493     struct hash<const intern_string_t> {
operator ()std::hash494         std::size_t operator()(const intern_string_t &ist) const {
495             return ist.hash();
496         }
497     };
498 }
499 
operator <(const char * left,const intern_string_t & right)500 inline bool operator<(const char *left, const intern_string_t &right) {
501     int rc = strncmp(left, right.get(), right.size());
502     return rc < 0;
503 }
504 
operator <(const intern_string_t & left,const char * right)505 inline bool operator<(const intern_string_t &left, const char *right) {
506     return strncmp(left.get(), right, left.size()) < 0;
507 }
508 
operator ==(const intern_string_t & left,const string_fragment & sf)509 inline bool operator==(const intern_string_t &left, const string_fragment &sf) {
510     return ((int) left.size() == sf.length()) &&
511            (memcmp(left.get(), sf.data(), left.size()) == 0);
512 }
513 
operator ==(const string_fragment & left,const intern_string_t & right)514 inline bool operator==(const string_fragment &left, const intern_string_t &right) {
515     return (left.length() == (int) right.size()) &&
516            (memcmp(left.data(), right.get(), left.length()) == 0);
517 }
518 
519 namespace std {
to_string(const string_fragment & s)520     inline string to_string(const string_fragment &s) {
521         return {s.data(), (size_t) s.length()};
522     }
523 
to_string(const intern_string_t & s)524     inline string to_string(const intern_string_t &s) {
525         return s.to_string();
526     }
527 }
528 
to_string_fragment(const string_fragment & s)529 inline string_fragment to_string_fragment(const string_fragment &s) {
530     return s;
531 }
532 
to_string_fragment(const intern_string_t & s)533 inline string_fragment to_string_fragment(const intern_string_t &s) {
534     return string_fragment(s.get(), 0, s.size());
535 }
536 
to_string_fragment(const std::string & s)537 inline string_fragment to_string_fragment(const std::string &s) {
538     return string_fragment(s.c_str(), 0, s.length());
539 }
540 
541 #endif
542