1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef SRC_STRING_SEARCH_H_
6 #define SRC_STRING_SEARCH_H_
7 
8 #if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
9 
10 #include "node_internals.h"
11 #include <string.h>
12 #include <algorithm>
13 
14 namespace node {
15 namespace stringsearch {
16 
// A small non-owning view over `length` consecutive elements starting at
// `data`. The view can be read front-to-back or back-to-front; in the latter
// case index 0 refers to the last element of the memory range.
template <typename T>
class Vector {
 public:
  // Wraps the range [data, data + length). CHECKs that the range is non-empty
  // and that `data` is not null.
  Vector(T* data, size_t length, bool isForward)
      : start_(data), length_(length), is_forward_(isForward) {
    CHECK(length > 0 && data != nullptr);
  }

  // Returns the start of the memory range.
  // For vector v this is NOT necessarily &v[0], see forward().
  const T* start() const { return start_; }

  // Returns the length of the vector, in characters.
  size_t length() const { return length_; }

  // Returns true if the Vector is front-to-back, false if back-to-front.
  // In the latter case, v[0] corresponds to the *end* of the memory range.
  // This is a predicate, so it returns bool rather than an integer type.
  bool forward() const { return is_forward_; }

  // Access individual vector elements - checks bounds in debug mode.
  // For a back-to-front vector the index is mirrored, i.e. v[i] refers to
  // start_[length_ - i - 1].
  T& operator[](size_t index) const {
#ifdef DEBUG
    CHECK(index < length_);
#endif
    return start_[is_forward_ ? index : (length_ - index - 1)];
  }

 private:
  T* start_;         // First element of the underlying memory range.
  size_t length_;    // Number of elements in the range.
  bool is_forward_;  // Direction in which operator[] walks the range.
};
49 
50 
51 //---------------------------------------------------------------------
52 // String Search object.
53 //---------------------------------------------------------------------
54 
// Shared base of all StringSearch instantiations. It holds the tuning
// constants and the table storage, none of which depend on the character
// size of subject or pattern.
class StringSearchBase {
 protected:
  // Upper bound on a single shift in the Boyer-Moore implementation. Having
  // a bound lets the tables below have a fixed size. A needle longer than
  // this bound only gets tables for a suffix of itself, so searching for it
  // is not optimal, but it is a safe approximation.
  static constexpr int kBMMaxShift = 250;

  // Size the alphabet is reduced to.
  // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has a size
  // proportional to the input alphabet. The alphabet is shrunk by treating
  // input characters as equal modulo this smaller size. Searching may become
  // less efficient, but it remains a safe approximation. Needles whose
  // characters all come from the same 256-code-point Unicode page suffer no
  // search speed degradation.
  static constexpr int kLatin1AlphabetSize = 256;
  static constexpr int kUC16AlphabetSize = 256;

  // Minimum pattern length for the Boyer-Moore family. For patterns below
  // this length, the skip length of Boyer-Moore is too short to compensate
  // for the algorithmic overhead compared to simple brute force.
  static constexpr int kBMMinPatternLength = 8;

  // Bad-char shift table used by Boyer-Moore(-Horspool). Its length is the
  // (reduced) alphabet size.
  int bad_char_shift_table_[kUC16AlphabetSize];
  // Good-suffix shift table used by Boyer-Moore.
  int good_suffix_shift_table_[kBMMaxShift + 1];
  // Scratch table that is only needed while the good-suffix shift table is
  // being built.
  int suffix_table_[kBMMaxShift + 1];
};
88 
89 template <typename Char>
90 class StringSearch : private StringSearchBase {
91  public:
92   typedef stringsearch::Vector<const Char> Vector;
93 
StringSearch(Vector pattern)94   explicit StringSearch(Vector pattern)
95       : pattern_(pattern), start_(0) {
96     if (pattern.length() >= kBMMaxShift) {
97       start_ = pattern.length() - kBMMaxShift;
98     }
99 
100     size_t pattern_length = pattern_.length();
101     CHECK_GT(pattern_length, 0);
102     if (pattern_length < kBMMinPatternLength) {
103       if (pattern_length == 1) {
104         strategy_ = &StringSearch::SingleCharSearch;
105         return;
106       }
107       strategy_ = &StringSearch::LinearSearch;
108       return;
109     }
110     strategy_ = &StringSearch::InitialSearch;
111   }
112 
Search(Vector subject,size_t index)113   size_t Search(Vector subject, size_t index) {
114     return (this->*strategy_)(subject, index);
115   }
116 
AlphabetSize()117   static inline int AlphabetSize() {
118     if (sizeof(Char) == 1) {
119       // Latin1 needle.
120       return kLatin1AlphabetSize;
121     } else {
122       // UC16 needle.
123       return kUC16AlphabetSize;
124     }
125 
126     static_assert(sizeof(Char) == sizeof(uint8_t) ||
127                   sizeof(Char) == sizeof(uint16_t),
128                   "sizeof(Char) == sizeof(uint16_t) || sizeof(uint8_t)");
129   }
130 
131  private:
132   typedef size_t (StringSearch::*SearchFunction)(Vector, size_t);
133   size_t SingleCharSearch(Vector subject, size_t start_index);
134   size_t LinearSearch(Vector subject, size_t start_index);
135   size_t InitialSearch(Vector subject, size_t start_index);
136   size_t BoyerMooreHorspoolSearch(Vector subject, size_t start_index);
137   size_t BoyerMooreSearch(Vector subject, size_t start_index);
138 
139   void PopulateBoyerMooreHorspoolTable();
140 
141   void PopulateBoyerMooreTable();
142 
CharOccurrence(int * bad_char_occurrence,Char char_code)143   static inline int CharOccurrence(int* bad_char_occurrence,
144                                    Char char_code) {
145     if (sizeof(Char) == 1) {
146       return bad_char_occurrence[static_cast<int>(char_code)];
147     }
148     // Both pattern and subject are UC16. Reduce character to equivalence class.
149     int equiv_class = char_code % kUC16AlphabetSize;
150     return bad_char_occurrence[equiv_class];
151   }
152 
153   // The pattern to search for.
154   Vector pattern_;
155   // Pointer to implementation of the search.
156   SearchFunction strategy_;
157   // Cache value of Max(0, pattern_length() - kBMMaxShift)
158   size_t start_;
159 };
160 
161 
// Clears the low bits of `value` selected by `alignment - 1` and returns the
// result as the same type. This rounds down to a multiple of `alignment` when
// `alignment` is a power of two.
template <typename T, typename U>
inline T AlignDown(T value, U alignment) {
  const uintptr_t address = reinterpret_cast<uintptr_t>(value);
  const uintptr_t aligned = address & ~(alignment - 1);
  return reinterpret_cast<T>(aligned);
}
167 
168 
// Returns whichever of the two bytes of `character` has the larger numeric
// value.
inline uint8_t GetHighestValueByte(uint16_t character) {
  const uint8_t low_byte = static_cast<uint8_t>(character & 0xFF);
  const uint8_t high_byte = static_cast<uint8_t>(character >> 8);
  return (high_byte > low_byte) ? high_byte : low_byte;
}
173 
174 
// One-byte overload: a single byte is its own highest-value byte.
inline uint8_t GetHighestValueByte(uint8_t character) {
  return character;
}
176 
177 
// Scans the `haystack_len` bytes starting at `haystack` from the last byte
// towards the first. Returns a pointer to the last byte equal to `needle`,
// or nullptr if no byte matches.
// Delegates to memrchr(3) when _GNU_SOURCE is defined; otherwise (for example
// on Windows) a plain loop does the same job.
inline const void* MemrchrFill(const void* haystack, uint8_t needle,
                               size_t haystack_len) {
#ifdef _GNU_SOURCE
  return memrchr(haystack, needle, haystack_len);
#else
  const uint8_t* const bytes = static_cast<const uint8_t*>(haystack);
  size_t remaining = haystack_len;
  while (remaining > 0) {
    --remaining;
    if (bytes[remaining] == needle) {
      return bytes + remaining;
    }
  }
  return nullptr;
#endif
}
195 
196 
197 // Finds the first occurrence of *two-byte* character pattern[0] in the string
198 // `subject`. Does not check that the whole pattern matches.
199 template <typename Char>
FindFirstCharacter(Vector<const Char> pattern,Vector<const Char> subject,size_t index)200 inline size_t FindFirstCharacter(Vector<const Char> pattern,
201                                  Vector<const Char> subject, size_t index) {
202   const Char pattern_first_char = pattern[0];
203   const size_t max_n = (subject.length() - pattern.length() + 1);
204 
205   // For speed, search for the more `rare` of the two bytes in pattern[0]
206   // using memchr / memrchr (which are much faster than a simple for loop).
207   const uint8_t search_byte = GetHighestValueByte(pattern_first_char);
208   size_t pos = index;
209   do {
210     const size_t bytes_to_search = (max_n - pos) * sizeof(Char);
211     const void* void_pos;
212     if (subject.forward()) {
213       // Assert that bytes_to_search won't overflow
214       CHECK_LE(pos, max_n);
215       CHECK_LE(max_n - pos, SIZE_MAX / sizeof(Char));
216       void_pos = memchr(subject.start() + pos, search_byte, bytes_to_search);
217     } else {
218       CHECK_LE(pos, subject.length());
219       CHECK_LE(subject.length() - pos, SIZE_MAX / sizeof(Char));
220       void_pos = MemrchrFill(subject.start() + pattern.length() - 1,
221                              search_byte,
222                              bytes_to_search);
223     }
224     const Char* char_pos = static_cast<const Char*>(void_pos);
225     if (char_pos == nullptr)
226       return subject.length();
227 
228     // Then, for each match, verify that the full two bytes match pattern[0].
229     char_pos = AlignDown(char_pos, sizeof(Char));
230     size_t raw_pos = static_cast<size_t>(char_pos - subject.start());
231     pos = subject.forward() ? raw_pos : (subject.length() - raw_pos - 1);
232     if (subject[pos] == pattern_first_char) {
233       // Match found, hooray.
234       return pos;
235     }
236     // Search byte matched, but the other byte of pattern[0] didn't. Keep going.
237   } while (++pos < max_n);
238 
239   return subject.length();
240 }
241 
242 
243 // Finds the first occurrence of the byte pattern[0] in string `subject`.
244 // Does not verify that the whole pattern matches.
245 template <>
FindFirstCharacter(Vector<const uint8_t> pattern,Vector<const uint8_t> subject,size_t index)246 inline size_t FindFirstCharacter(Vector<const uint8_t> pattern,
247                                  Vector<const uint8_t> subject,
248                                  size_t index) {
249   const uint8_t pattern_first_char = pattern[0];
250   const size_t subj_len = subject.length();
251   const size_t max_n = (subject.length() - pattern.length() + 1);
252 
253   const void* pos;
254   if (subject.forward()) {
255     pos = memchr(subject.start() + index, pattern_first_char, max_n - index);
256   } else {
257     pos = MemrchrFill(subject.start() + pattern.length() - 1,
258                       pattern_first_char,
259                       max_n - index);
260   }
261   const uint8_t* char_pos = static_cast<const uint8_t*>(pos);
262   if (char_pos == nullptr) {
263     return subj_len;
264   }
265 
266   size_t raw_pos = static_cast<size_t>(char_pos - subject.start());
267   return subject.forward() ? raw_pos : (subj_len - raw_pos - 1);
268 }
269 
270 //---------------------------------------------------------------------
271 // Single Character Pattern Search Strategy
272 //---------------------------------------------------------------------
273 
274 template <typename Char>
SingleCharSearch(Vector subject,size_t index)275 size_t StringSearch<Char>::SingleCharSearch(
276     Vector subject,
277     size_t index) {
278   CHECK_EQ(1, pattern_.length());
279   return FindFirstCharacter(pattern_, subject, index);
280 }
281 
282 //---------------------------------------------------------------------
283 // Linear Search Strategy
284 //---------------------------------------------------------------------
285 
286 // Simple linear search for short patterns. Never bails out.
287 template <typename Char>
LinearSearch(Vector subject,size_t index)288 size_t StringSearch<Char>::LinearSearch(
289     Vector subject,
290     size_t index) {
291   CHECK_GT(pattern_.length(), 1);
292   const size_t n = subject.length() - pattern_.length();
293   for (size_t i = index; i <= n; i++) {
294     i = FindFirstCharacter(pattern_, subject, i);
295     if (i == subject.length())
296       return subject.length();
297     CHECK_LE(i, n);
298 
299     bool matches = true;
300     for (size_t j = 1; j < pattern_.length(); j++) {
301       if (pattern_[j] != subject[i + j]) {
302         matches = false;
303         break;
304       }
305     }
306     if (matches) {
307       return i;
308     }
309   }
310   return subject.length();
311 }
312 
313 //---------------------------------------------------------------------
314 // Boyer-Moore string search
315 //---------------------------------------------------------------------
316 
317 template <typename Char>
BoyerMooreSearch(Vector subject,size_t start_index)318 size_t StringSearch<Char>::BoyerMooreSearch(
319     Vector subject,
320     size_t start_index) {
321   const size_t subject_length = subject.length();
322   const size_t pattern_length = pattern_.length();
323   // Only preprocess at most kBMMaxShift last characters of pattern.
324   size_t start = start_;
325 
326   int* bad_char_occurrence = bad_char_shift_table_;
327   int* good_suffix_shift = good_suffix_shift_table_ - start_;
328 
329   Char last_char = pattern_[pattern_length - 1];
330   size_t index = start_index;
331   // Continue search from i.
332   while (index <= subject_length - pattern_length) {
333     size_t j = pattern_length - 1;
334     int c;
335     while (last_char != (c = subject[index + j])) {
336       int shift = j - CharOccurrence(bad_char_occurrence, c);
337       index += shift;
338       if (index > subject_length - pattern_length) {
339         return subject.length();
340       }
341     }
342     while (pattern_[j] == (c = subject[index + j])) {
343       if (j == 0) {
344         return index;
345       }
346       j--;
347     }
348     if (j < start) {
349       // we have matched more than our tables allow us to be smart about.
350       // Fall back on BMH shift.
351       index += pattern_length - 1 -
352                CharOccurrence(bad_char_occurrence, last_char);
353     } else {
354       int gs_shift = good_suffix_shift[j + 1];
355       int bc_occ = CharOccurrence(bad_char_occurrence, c);
356       int shift = j - bc_occ;
357       if (gs_shift > shift) {
358         shift = gs_shift;
359       }
360       index += shift;
361     }
362   }
363 
364   return subject.length();
365 }
366 
367 template <typename Char>
PopulateBoyerMooreTable()368 void StringSearch<Char>::PopulateBoyerMooreTable() {
369   const size_t pattern_length = pattern_.length();
370   // Only look at the last kBMMaxShift characters of pattern (from start_
371   // to pattern_length).
372   const size_t start = start_;
373   const size_t length = pattern_length - start;
374 
375   // Biased tables so that we can use pattern indices as table indices,
376   // even if we only cover the part of the pattern from offset start.
377   int* shift_table = good_suffix_shift_table_ - start_;
378   int* suffix_table = suffix_table_ - start_;
379 
380   // Initialize table.
381   for (size_t i = start; i < pattern_length; i++) {
382     shift_table[i] = length;
383   }
384   shift_table[pattern_length] = 1;
385   suffix_table[pattern_length] = pattern_length + 1;
386 
387   if (pattern_length <= start) {
388     return;
389   }
390 
391   // Find suffixes.
392   Char last_char = pattern_[pattern_length - 1];
393   size_t suffix = pattern_length + 1;
394   {
395     size_t i = pattern_length;
396     while (i > start) {
397       Char c = pattern_[i - 1];
398       while (suffix <= pattern_length && c != pattern_[suffix - 1]) {
399         if (static_cast<size_t>(shift_table[suffix]) == length) {
400           shift_table[suffix] = suffix - i;
401         }
402         suffix = suffix_table[suffix];
403       }
404       suffix_table[--i] = --suffix;
405       if (suffix == pattern_length) {
406         // No suffix to extend, so we check against last_char only.
407         while ((i > start) && (pattern_[i - 1] != last_char)) {
408           if (static_cast<size_t>(shift_table[pattern_length]) == length) {
409             shift_table[pattern_length] = pattern_length - i;
410           }
411           suffix_table[--i] = pattern_length;
412         }
413         if (i > start) {
414           suffix_table[--i] = --suffix;
415         }
416       }
417     }
418   }
419   // Build shift table using suffixes.
420   if (suffix < pattern_length) {
421     for (size_t i = start; i <= pattern_length; i++) {
422       if (static_cast<size_t>(shift_table[i]) == length) {
423         shift_table[i] = suffix - start;
424       }
425       if (i == suffix) {
426         suffix = suffix_table[suffix];
427       }
428     }
429   }
430 }
431 
432 //---------------------------------------------------------------------
433 // Boyer-Moore-Horspool string search.
434 //---------------------------------------------------------------------
435 
436 template <typename Char>
BoyerMooreHorspoolSearch(Vector subject,size_t start_index)437 size_t StringSearch<Char>::BoyerMooreHorspoolSearch(
438     Vector subject,
439     size_t start_index) {
440   const size_t subject_length = subject.length();
441   const size_t pattern_length = pattern_.length();
442   int* char_occurrences = bad_char_shift_table_;
443   int64_t badness = -pattern_length;
444 
445   // How bad we are doing without a good-suffix table.
446   Char last_char = pattern_[pattern_length - 1];
447   int last_char_shift =
448       pattern_length - 1 -
449       CharOccurrence(char_occurrences, last_char);
450 
451   // Perform search
452   size_t index = start_index;  // No matches found prior to this index.
453   while (index <= subject_length - pattern_length) {
454     size_t j = pattern_length - 1;
455     int subject_char;
456     while (last_char != (subject_char = subject[index + j])) {
457       int bc_occ = CharOccurrence(char_occurrences, subject_char);
458       int shift = j - bc_occ;
459       index += shift;
460       badness += 1 - shift;  // at most zero, so badness cannot increase.
461       if (index > subject_length - pattern_length) {
462         return subject_length;
463       }
464     }
465     j--;
466     while (pattern_[j] == (subject[index + j])) {
467       if (j == 0) {
468         return index;
469       }
470       j--;
471     }
472     index += last_char_shift;
473     // Badness increases by the number of characters we have
474     // checked, and decreases by the number of characters we
475     // can skip by shifting. It's a measure of how we are doing
476     // compared to reading each character exactly once.
477     badness += (pattern_length - j) - last_char_shift;
478     if (badness > 0) {
479       PopulateBoyerMooreTable();
480       strategy_ = &StringSearch::BoyerMooreSearch;
481       return BoyerMooreSearch(subject, index);
482     }
483   }
484   return subject.length();
485 }
486 
487 template <typename Char>
PopulateBoyerMooreHorspoolTable()488 void StringSearch<Char>::PopulateBoyerMooreHorspoolTable() {
489   const size_t pattern_length = pattern_.length();
490 
491   int* bad_char_occurrence = bad_char_shift_table_;
492 
493   // Only preprocess at most kBMMaxShift last characters of pattern.
494   const size_t start = start_;
495   // Run forwards to populate bad_char_table, so that *last* instance
496   // of character equivalence class is the one registered.
497   // Notice: Doesn't include the last character.
498   const size_t table_size = AlphabetSize();
499   if (start == 0) {
500     // All patterns less than kBMMaxShift in length.
501     memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence));
502   } else {
503     for (size_t i = 0; i < table_size; i++) {
504       bad_char_occurrence[i] = start - 1;
505     }
506   }
507   for (size_t i = start; i < pattern_length - 1; i++) {
508     Char c = pattern_[i];
509     int bucket = (sizeof(Char) == 1) ? c : c % AlphabetSize();
510     bad_char_occurrence[bucket] = i;
511   }
512 }
513 
514 //---------------------------------------------------------------------
515 // Linear string search with bailout to BMH.
516 //---------------------------------------------------------------------
517 
518 // Simple linear search for short patterns, which bails out if the string
519 // isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
520 template <typename Char>
InitialSearch(Vector subject,size_t index)521 size_t StringSearch<Char>::InitialSearch(
522     Vector subject,
523     size_t index) {
524   const size_t pattern_length = pattern_.length();
525   // Badness is a count of how much work we have done.  When we have
526   // done enough work we decide it's probably worth switching to a better
527   // algorithm.
528   int64_t badness = -10 - (pattern_length << 2);
529 
530   // We know our pattern is at least 2 characters, we cache the first so
531   // the common case of the first character not matching is faster.
532   for (size_t i = index, n = subject.length() - pattern_length; i <= n; i++) {
533     badness++;
534     if (badness <= 0) {
535       i = FindFirstCharacter(pattern_, subject, i);
536       if (i == subject.length())
537         return subject.length();
538       CHECK_LE(i, n);
539       size_t j = 1;
540       do {
541         if (pattern_[j] != subject[i + j]) {
542           break;
543         }
544         j++;
545       } while (j < pattern_length);
546       if (j == pattern_length) {
547         return i;
548       }
549       badness += j;
550     } else {
551       PopulateBoyerMooreHorspoolTable();
552       strategy_ = &StringSearch::BoyerMooreHorspoolSearch;
553       return BoyerMooreHorspoolSearch(subject, i);
554     }
555   }
556   return subject.length();
557 }
558 
559 // Perform a single stand-alone search.
560 // If searching multiple times for the same pattern, a search
561 // object should be constructed once and the Search function then called
562 // for each search.
563 template <typename Char>
SearchString(Vector<const Char> subject,Vector<const Char> pattern,size_t start_index)564 size_t SearchString(Vector<const Char> subject,
565                     Vector<const Char> pattern,
566                     size_t start_index) {
567   StringSearch<Char> search(pattern);
568   return search.Search(subject, start_index);
569 }
570 }  // namespace stringsearch
571 }  // namespace node
572 
573 namespace node {
574 
575 template <typename Char>
SearchString(const Char * haystack,size_t haystack_length,const Char * needle,size_t needle_length,size_t start_index,bool is_forward)576 size_t SearchString(const Char* haystack,
577                     size_t haystack_length,
578                     const Char* needle,
579                     size_t needle_length,
580                     size_t start_index,
581                     bool is_forward) {
582   if (haystack_length < needle_length) return haystack_length;
583   // To do a reverse search (lastIndexOf instead of indexOf) without redundant
584   // code, create two vectors that are reversed views into the input strings.
585   // For example, v_needle[0] would return the *last* character of the needle.
586   // So we're searching for the first instance of rev(needle) in rev(haystack)
587   stringsearch::Vector<const Char> v_needle(needle, needle_length, is_forward);
588   stringsearch::Vector<const Char> v_haystack(
589       haystack, haystack_length, is_forward);
590   size_t diff = haystack_length - needle_length;
591   size_t relative_start_index;
592   if (is_forward) {
593     relative_start_index = start_index;
594   } else if (diff < start_index) {
595     relative_start_index = 0;
596   } else {
597     relative_start_index = diff - start_index;
598   }
599   size_t pos = node::stringsearch::SearchString(
600       v_haystack, v_needle, relative_start_index);
601   if (pos == haystack_length) {
602     // not found
603     return pos;
604   }
605   return is_forward ? pos : (haystack_length - needle_length - pos);
606 }
607 
608 template <size_t N>
SearchString(const char * haystack,size_t haystack_length,const char (& needle)[N])609 size_t SearchString(const char* haystack, size_t haystack_length,
610                     const char (&needle)[N]) {
611   return SearchString(
612       reinterpret_cast<const uint8_t*>(haystack), haystack_length,
613       reinterpret_cast<const uint8_t*>(needle), N - 1, 0, true);
614 }
615 
616 }  // namespace node
617 
618 #endif  // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
619 
620 #endif  // SRC_STRING_SEARCH_H_
621