// Copyright (C) 2018 ycmd contributors
//
// This file is part of ycmd.
//
// ycmd is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// ycmd is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with ycmd. If not, see <http://www.gnu.org/licenses/>.

#ifndef CODE_POINT_H_3W0LNCLY
#define CODE_POINT_H_3W0LNCLY

// <cstdint> and <string_view> are included explicitly because this header
// uses uint8_t and std::string_view directly; relying on transitive includes
// from <string> is not portable.
#include <cstdint>
#include <stdexcept>
#include <string>
#include <string_view>
#include <vector>

// NOTE(review): YCM_EXPORT is used below but not defined in this header;
// presumably it is supplied by the build configuration -- confirm.

namespace YouCompleteMe {

// See
// http://www.unicode.org/reports/tr29/tr29-37.html#Grapheme_Cluster_Break_Property_Values
// NOTE: The properties must take the same value as the ones defined in the
// update_unicode.py script.
enum class BreakProperty : uint8_t {
  OTHER              = 0,
  CR                 = 1,
  LF                 = 2,
  CONTROL            = 3,
  EXTEND             = 4,
  ZWJ                = 5,
  REGIONAL_INDICATOR = 6,
  PREPEND            = 7,
  SPACINGMARK        = 8,
  L                  = 9,
  V                  = 10,
  T                  = 11,
  LV                 = 12,
  LVT                = 13,
  // The gap between 13 and 18 is intentional: values mirror the generator
  // script mentioned above.
  EXTPICT            = 18
};


// This is the structure used to store the data in the Unicode table. See the
// CodePoint class for a description of the members.
struct RawCodePoint {
  std::string_view original;
  std::string_view normal;
  std::string_view folded_case;
  std::string_view swapped_case;
  bool is_letter;
  bool is_punctuation;
  bool is_uppercase;
  uint8_t break_property;
  uint8_t combining_class;
};


// This class represents a UTF-8 code point. It takes a UTF-8 encoded string
// corresponding to a UTF-8 code point and computes the following properties
// from a Unicode table:
//  - the UTF-8 code point itself;
//  - its normalized version: two code points (or sequence of code points)
//    represent the same character if they have identical normalized version;
//  - its case-folded version: identical to the normalized version if the code
//    point is caseless;
//  - its case-swapped version: lowercase if the code point is uppercase,
//    uppercase if the code point is lowercase, identical to the normalized
//    version if the code point is caseless;
//  - if the code point is a letter;
//  - if the code point is a punctuation;
//  - if the code point is in uppercase: false if the code point has no
//    uppercase version;
//  - its breaking property: used to split a word into characters.
//  - its combining class: used to sort a sequence of code points according to
//    the Canonical Ordering algorithm (see
//    https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf#G49591).
class CodePoint {
public:
  // Builds a code point from its UTF-8 encoded text (defined out of line).
  YCM_EXPORT explicit CodePoint( std::string_view code_point );
  // Make class noncopyable
  CodePoint( const CodePoint& ) = delete;
  CodePoint& operator=( const CodePoint& ) = delete;
  // Moving is allowed.
  CodePoint( CodePoint&& ) = default;
  CodePoint& operator=( CodePoint&& ) = default;

  // Returns a copy of the normalized form.
  inline std::string Normal() const {
    return normal_;
  }

  // Returns a copy of the case-folded form.
  inline std::string FoldedCase() const {
    return folded_case_;
  }

  // Returns a copy of the case-swapped form.
  inline std::string SwappedCase() const {
    return swapped_case_;
  }

  inline bool IsLetter() const {
    return is_letter_;
  }

  inline bool IsPunctuation() const {
    return is_punctuation_;
  }

  inline bool IsUppercase() const {
    return is_uppercase_;
  }

  inline BreakProperty GetBreakProperty() const {
    return break_property_;
  }

  inline uint8_t CombiningClass() const {
    return combining_class_;
  }

  // Orders code points by combining class only; the text of the code points
  // does not take part in the comparison.
  inline bool operator< ( const CodePoint &other ) const {
    return combining_class_ < other.combining_class_;
  }

private:
  // Builds a code point from a raw Unicode table entry (defined out of line).
  explicit CodePoint( RawCodePoint&& code_point );

  std::string normal_;
  std::string folded_case_;
  std::string swapped_case_;
  bool is_letter_;
  bool is_punctuation_;
  bool is_uppercase_;
  BreakProperty break_property_;
  uint8_t combining_class_;
};


// A sequence of pointers to immutable code points.
using CodePointSequence = std::vector< const CodePoint * >;


// Split a UTF-8 encoded string into UTF-8 code points.
YCM_EXPORT CodePointSequence BreakIntoCodePoints( std::string_view text );


// Thrown when an error occurs while decoding a UTF-8 string.
struct YCM_EXPORT UnicodeDecodeError : std::runtime_error {
  // Inherit the std::runtime_error constructors.
  using std::runtime_error::runtime_error;
  // Overrides std::runtime_error::what(); defined out of line.
  const char* what() const noexcept override;
};

} // namespace YouCompleteMe

#endif /* end of include guard: CODE_POINT_H_3W0LNCLY */