// Copyright (C) 2018 ycmd contributors
//
// This file is part of ycmd.
//
// ycmd is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// ycmd is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with ycmd.  If not, see <http://www.gnu.org/licenses/>.
17 
18 #ifndef CODE_POINT_H_3W0LNCLY
19 #define CODE_POINT_H_3W0LNCLY
20 
#include <cstdint>
#include <stdexcept>
#include <string>
#include <string_view>
#include <vector>
24 
25 namespace YouCompleteMe {
26 
// Grapheme cluster break property of a code point. See
// http://www.unicode.org/reports/tr29/tr29-37.html#Grapheme_Cluster_Break_Property_Values
// NOTE: The properties must take the same value as the ones defined in the
// update_unicode.py script, so do not renumber or reorder them. No enumerator
// uses the values 14 to 17: EXTPICT is 18 and must stay 18.
enum class BreakProperty : uint8_t {
  OTHER              =  0,
  CR                 =  1,
  LF                 =  2,
  CONTROL            =  3,
  EXTEND             =  4,
  ZWJ                =  5,
  REGIONAL_INDICATOR =  6,
  PREPEND            =  7,
  SPACINGMARK        =  8,
  L                  =  9,
  V                  = 10,
  T                  = 11,
  LV                 = 12,
  LVT                = 13,
  EXTPICT            = 18
};
48 
49 
// This is the structure used to store the data in the Unicode table. See the
// CodePoint class for a description of the members.
//
// The string views do not own the text they refer to. The scalar members carry
// default values so that a default-constructed object never holds
// indeterminate data. The struct is still an aggregate: brace initialization
// listing the members in declaration order keeps working, so the member order
// must not change.
struct RawCodePoint {
  std::string_view original;
  std::string_view normal;
  std::string_view folded_case;
  std::string_view swapped_case;
  bool is_letter{ false };
  bool is_punctuation{ false };
  bool is_uppercase{ false };
  // Stored as a raw integer rather than as a BreakProperty; presumably one of
  // the BreakProperty values -- confirm against the table generator.
  uint8_t break_property{ 0 };
  uint8_t combining_class{ 0 };
};
63 
64 
65 // This class represents a UTF-8 code point. It takes a UTF-8 encoded string
66 // corresponding to a UTF-8 code point and compute the following properties
67 // from a Unicode table:
68 //  - the UTF-8 code point itself;
69 //  - its normalized version: two code points (or sequence of code points)
70 //    represent the same character if they have identical normalized version;
71 //  - its case-folded version: identical to the normalized version if the code
72 //    point is caseless;
73 //  - its case-swapped version: lowercase if the code point is uppercase,
74 //    uppercase if the code point is lowercase, identical to the normalized
75 //    version if the code point is caseless;
76 //  - if the code point is a letter;
77 //  - if the code point is a punctuation;
78 //  - if the code point is in uppercase: false if the code point has no
79 //    uppercase version;
80 //  - its breaking property: used to split a word into characters.
81 //  - its combining class: used to sort a sequence of code points according to
82 //    the Canonical Ordering algorithm (see
83 //    https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf#G49591).
84 class CodePoint {
85 public:
86   YCM_EXPORT explicit CodePoint( std::string_view code_point );
87   // Make class noncopyable
88   CodePoint( const CodePoint& ) = delete;
89   CodePoint& operator=( const CodePoint& ) = delete;
90   CodePoint( CodePoint&& ) = default;
91   CodePoint& operator=( CodePoint&& ) = default;
92 
Normal()93   inline std::string Normal() const {
94     return normal_;
95   }
96 
FoldedCase()97   inline std::string FoldedCase() const {
98     return folded_case_;
99   }
100 
SwappedCase()101   inline std::string SwappedCase() const {
102     return swapped_case_;
103   }
104 
IsLetter()105   inline bool IsLetter() const {
106     return is_letter_;
107   }
108 
IsPunctuation()109   inline bool IsPunctuation() const {
110     return is_punctuation_;
111   }
112 
IsUppercase()113   inline bool IsUppercase() const {
114     return is_uppercase_;
115   }
116 
GetBreakProperty()117   inline BreakProperty GetBreakProperty() const {
118     return break_property_;
119   }
120 
CombiningClass()121   inline uint8_t CombiningClass() const {
122     return combining_class_;
123   }
124 
125   inline bool operator< ( const CodePoint &other ) const {
126     return combining_class_ < other.combining_class_;
127   }
128 
129 private:
130   explicit CodePoint( RawCodePoint&& code_point );
131 
132   std::string normal_;
133   std::string folded_case_;
134   std::string swapped_case_;
135   bool is_letter_;
136   bool is_punctuation_;
137   bool is_uppercase_;
138   BreakProperty break_property_;
139   uint8_t combining_class_;
140 };
141 
142 
143 using CodePointSequence = std::vector< const CodePoint * >;
144 
145 
146 // Split a UTF-8 encoded string into UTF-8 code points.
147 YCM_EXPORT CodePointSequence BreakIntoCodePoints( std::string_view text );
148 
149 
150 // Thrown when an error occurs while decoding a UTF-8 string.
151 struct YCM_EXPORT UnicodeDecodeError : std::runtime_error {
152   using std::runtime_error::runtime_error;
153   const char* what() const noexcept override;
154 };
155 
156 } // namespace YouCompleteMe
157 
158 #endif /* end of include guard: CODE_POINT_H_3W0LNCLY */
159