1 // Copyright 2019 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef CHROME_COMMON_STRING_MATCHING_TERM_BREAK_ITERATOR_H_
6 #define CHROME_COMMON_STRING_MATCHING_TERM_BREAK_ITERATOR_H_
7 
8 #include <stddef.h>
9 
10 #include <memory>
11 
12 #include "base/macros.h"
13 #include "base/strings/string16.h"
14 
// Forward declaration of base::i18n::UTF16CharIterator. This header only
// names the type as the pointee of a std::unique_ptr member, so a forward
// declaration is used here instead of including the type's own header.
namespace base {
namespace i18n {
class UTF16CharIterator;
}  // namespace i18n
}  // namespace base
20 
21 // TermBreakIterator breaks terms out of a word. Terms are broken on
22 // camel case boundaries and alpha/number boundaries. Numbers are defined
23 // as [0-9\.,]+.
24 //  e.g.
25 //   CamelCase -> Camel, Case
26 //   Python2.7 -> Python, 2.7
27 class TermBreakIterator {
28  public:
29   // Note that |word| must out live this iterator.
30   explicit TermBreakIterator(const base::string16& word);
31   ~TermBreakIterator();
32 
33   // Advance to the next term. Returns false if at the end of the word.
34   bool Advance();
35 
36   // Returns the current term, which is the substr of |word_| in range
37   // [prev_, pos_).
38   const base::string16 GetCurrentTerm() const;
39 
prev()40   size_t prev() const { return prev_; }
pos()41   size_t pos() const { return pos_; }
42 
43   static const size_t npos = static_cast<size_t>(-1);
44 
45  private:
46   enum State {
47     STATE_START,   // Initial state
48     STATE_NUMBER,  // Current char is a number [0-9\.,].
49     STATE_UPPER,   // Current char is upper case.
50     STATE_LOWER,   // Current char is lower case.
51     STATE_CHAR,    // Current char has no case, e.g. a cjk char.
52     STATE_LAST,
53   };
54 
55   // Returns new state for given |ch|.
56   State GetNewState(base::char16 ch);
57 
58   const base::string16& word_;
59   size_t prev_;
60   size_t pos_;
61 
62   std::unique_ptr<base::i18n::UTF16CharIterator> iter_;
63   State state_;
64 
65   DISALLOW_COPY_AND_ASSIGN(TermBreakIterator);
66 };
67 
68 #endif  // CHROME_COMMON_STRING_MATCHING_TERM_BREAK_ITERATOR_H_
69