1 /*
2 * SPDX-FileCopyrightText: 2015-2017 CSSlayer <wengxt@gmail.com>
3 *
4 * SPDX-License-Identifier: LGPL-2.1-or-later
5 *
6 */
7 #ifndef _FCITX_UTILS_UTF8_H_
8 #define _FCITX_UTILS_UTF8_H_
9
10 /// \addtogroup FcitxUtils
11 /// \{
12 /// \file
13 /// \brief C++ Utility functions for handling utf8 strings.
14
15 #include <stdexcept>
16 #include <string>
17 #include <fcitx-utils/cutf8.h>
18 #include <fcitx-utils/misc.h>
19 #include "fcitxutils_export.h"
20
21 namespace fcitx {
22 namespace utf8 {
23
/// \brief Return the number of UTF-8 characters in the string iterator range.
///
/// The range is passed to fcitx_utf8_strnlen() as a pointer to its first
/// element plus a byte count, so the iterators must refer to contiguous char
/// storage. For a validating variant see lengthValidated().
///
/// \param start iterator to the first byte of the range.
/// \param end iterator one past the last byte of the range.
/// \return 0 for an empty range, otherwise the value reported by
///         fcitx_utf8_strnlen().
/// \see lengthValidated()
template <typename Iter>
inline size_t length(Iter start, Iter end) {
    // An empty range contains no characters. Returning early also avoids
    // dereferencing a past-the-end iterator just to obtain a pointer.
    if (start == end) {
        return 0;
    }
    return fcitx_utf8_strnlen(&(*start), std::distance(start, end));
}
30
/// \brief Return the number of UTF-8 characters in the string.
///
/// Convenience overload that forwards the whole container, from
/// std::begin(s) to std::end(s), to the iterator-range overload.
/// \see lengthValidated()
template <typename T>
inline size_t length(const T &s) {
    auto first = std::begin(s);
    auto last = std::end(s);
    return length(first, last);
}
37
/// \brief Return the number of UTF-8 characters in a sub-range of the string.
///
/// \param s the container holding the bytes.
/// \param start offset of the first byte, counted from std::begin(s).
/// \param end offset one past the last byte, counted from std::begin(s).
///
/// The offsets are not checked against the size of \p s.
template <typename T>
inline size_t length(const T &s, size_t start, size_t end) {
    auto base = std::begin(s);
    return length(std::next(base, start), std::next(base, end));
}
44
/// \brief Possible return value of lengthValidated if the string is not valid.
///
/// Equal to the largest value representable by size_t. Declared inline so that
/// every translation unit including this header shares a single definition.
/// \see lengthValidated()
inline constexpr size_t INVALID_LENGTH = static_cast<size_t>(-1);
48
/// \brief Validate and return the number of UTF-8 characters in the string
/// iterator range.
///
/// The range is passed to fcitx_utf8_strnlen_validated() as a pointer to its
/// first element plus a byte count, so the iterators must refer to contiguous
/// char storage.
///
/// \param start iterator to the first byte of the range.
/// \param end iterator one past the last byte of the range.
/// \return 0 for an empty range, otherwise the value reported by
///         fcitx_utf8_strnlen_validated(). Will return INVALID_LENGTH if the
///         string is not a valid utf8 string.
template <typename Iter>
inline size_t lengthValidated(Iter start, Iter end) {
    // An empty range is trivially valid and has no characters. Returning
    // early also avoids dereferencing a past-the-end iterator.
    if (start == end) {
        return 0;
    }
    return fcitx_utf8_strnlen_validated(&(*start), std::distance(start, end));
}
57
/// \brief Validate and return the number of UTF-8 characters in the string.
///
/// Convenience overload that forwards the whole container, from
/// std::begin(s) to std::end(s), to the iterator-range overload.
/// Will return INVALID_LENGTH if string is not a valid utf8 string.
template <typename T>
inline size_t lengthValidated(const T &s) {
    auto first = std::begin(s);
    auto last = std::end(s);
    return lengthValidated(first, last);
}
65
66 /// \brief Check if the string iterator range is valid utf8 string
67 template <typename Iter>
validate(Iter start,Iter end)68 inline bool validate(Iter start, Iter end) {
69 return lengthValidated(start, end) != INVALID_LENGTH;
70 }
71
/// \brief Check if the string is a valid utf8 string.
///
/// Convenience overload that forwards the whole container, from
/// std::begin(s) to std::end(s), to the iterator-range overload.
template <typename T>
inline bool validate(const T &s) {
    auto first = std::begin(s);
    auto last = std::end(s);
    return validate(first, last);
}
77
/// \brief Convert a single UCS4 code point to its UTF-8 encoded string.
79 FCITXUTILS_EXPORT std::string UCS4ToUTF8(uint32_t code);
80
/// \brief Check if a UCS4 code point is valid.
82 FCITXUTILS_EXPORT bool UCS4IsValid(uint32_t code);
83
/// \brief Possible return value for getChar.
///
/// Sentinel with the value static_cast<uint32_t>(-1). Declared inline so that
/// every translation unit including this header shares a single definition.
/// \see isValidChar()
inline constexpr uint32_t INVALID_CHAR = static_cast<uint32_t>(-1);
86
/// \brief Possible return value for getChar.
///
/// Sentinel with the value static_cast<uint32_t>(-2). Declared inline so that
/// every translation unit including this header shares a single definition.
/// \see isValidChar()
inline constexpr uint32_t NOT_ENOUGH_SPACE = static_cast<uint32_t>(-2);
89
90 /// \brief Check the chr value is not two invalid value above.
isValidChar(uint32_t c)91 inline bool isValidChar(uint32_t c) {
92 return c != INVALID_CHAR && c != NOT_ENOUGH_SPACE;
93 }
94
/// \brief Get next UCS4 char from iter, do not cross end. May return
/// INVALID_CHAR or NOT_ENOUGH_SPACE.
///
/// The bytes are passed to fcitx_utf8_get_char_validated() as a pointer to
/// \p iter plus the number of bytes up to \p end; the byte length of the
/// decoded character is not requested.
template <typename Iter>
inline uint32_t getChar(Iter iter, Iter end) {
    const auto available = std::distance(iter, end);
    const char *bytes = &(*iter);
    return fcitx_utf8_get_char_validated(bytes, available, nullptr);
}
102
/// \brief Get the first UCS4 char of the string, may return INVALID_CHAR or
/// NOT_ENOUGH_SPACE.
///
/// Convenience overload that forwards the whole container, from
/// std::begin(s) to std::end(s), to the iterator-range overload.
template <typename T>
inline uint32_t getChar(const T &s) {
    auto first = std::begin(s);
    auto last = std::end(s);
    return getChar(first, last);
}
108
/// \brief Decode the character at iter and return an iterator advanced past it.
///
/// \param iter iterator to the first byte to decode.
/// \param end iterator one past the last byte that may be read.
/// \param chr receives the value returned by fcitx_utf8_get_char_validated().
/// \return \p iter advanced by the byte length reported by the decoder; the
///         length starts at 0, so \p iter is returned unchanged if the decoder
///         does not report one.
template <typename Iter>
inline Iter getNextChar(Iter iter, Iter end, uint32_t *chr) {
    int consumed = 0;
    const auto available = std::distance(iter, end);
    const char *bytes = &(*iter);
    *chr = fcitx_utf8_get_char_validated(bytes, available, &consumed);
    std::advance(iter, consumed);
    return iter;
}
116
117 /// \brief get the byte length of next N utf-8 character.
118 ///
119 /// This function has no error check on invalid string or end of string. Check
120 /// the string before use it.
121 template <typename Iter>
ncharByteLength(Iter iter,size_t n)122 inline int ncharByteLength(Iter iter, size_t n) {
123 const char *c = &(*iter);
124 int diff = fcitx_utf8_get_nth_char(c, n) - c;
125 return diff;
126 }
127
/// \brief Move iter over the next n characters.
///
/// The byte offset is computed by ncharByteLength(), which performs no error
/// checking.
template <typename Iter>
inline Iter nextNChar(Iter iter, size_t n) {
    const auto byteCount = ncharByteLength(iter, n);
    return std::next(iter, byteCount);
}
133
/// \brief Move iter over the next one character.
///
/// Equivalent to nextNChar() with a count of 1.
template <typename Iter>
Iter nextChar(Iter iter) {
    constexpr size_t singleChar = 1;
    return nextNChar(iter, singleChar);
}
139
140 template <typename Iter>
getLastChar(Iter iter,Iter end)141 uint32_t getLastChar(Iter iter, Iter end) {
142 uint32_t c = NOT_ENOUGH_SPACE;
143 while (iter != end) {
144 iter = getNextChar(iter, end, &c);
145 if (!isValidChar(c)) {
146 break;
147 }
148 }
149 return c;
150 }
151
/// \brief Return the last decoded character of the string.
///
/// Convenience overload that forwards the whole container, from
/// std::begin(str) to std::end(str), to the iterator-range overload.
template <typename T>
uint32_t getLastChar(const T &str) {
    auto first = std::begin(str);
    auto last = std::end(str);
    return getLastChar(first, last);
}
156
157 /// \brief Helper class to iterate character, you need to validate the string
158 /// before using it.
159 template <typename Iter>
160 class UTF8CharIterator {
161 public:
162 typedef std::input_iterator_tag iterator_category;
163 typedef uint32_t value_type;
164 typedef std::ptrdiff_t difference_type;
165 typedef const value_type &reference;
166 typedef const value_type *pointer;
167
UTF8CharIterator(Iter iter,Iter end)168 UTF8CharIterator(Iter iter, Iter end) : iter_(iter), end_(end) { update(); }
FCITX_INLINE_DEFINE_DEFAULT_DTOR_AND_COPY(UTF8CharIterator)169 FCITX_INLINE_DEFINE_DEFAULT_DTOR_AND_COPY(UTF8CharIterator)
170
171 reference operator*() const { return currentChar_; }
172
173 pointer operator->() const { return ¤tChar_; }
174
charRange()175 std::pair<Iter, Iter> charRange() const { return {iter_, next_}; }
176
charLength()177 size_t charLength() const { return std::distance(iter_, next_); }
178
view()179 std::string_view view() const {
180 return std::string_view{&*iter_, charLength()};
181 }
182
183 UTF8CharIterator &operator++() {
184 iter_ = next_;
185 update();
186 return *this;
187 }
188
189 UTF8CharIterator operator++(int) {
190 auto old = *this;
191 ++(*this);
192 return old;
193 }
194
195 bool operator==(const UTF8CharIterator &other) {
196 return iter_ == other.iter_;
197 }
198 bool operator!=(const UTF8CharIterator &other) {
199 return !operator==(other);
200 }
201
202 private:
update()203 void update() {
204 next_ = getNextChar(iter_, end_, ¤tChar_);
205 if (iter_ != end_ && iter_ == next_) {
206 throw std::runtime_error("Invalid UTF8 character.");
207 }
208 }
209
210 uint32_t currentChar_ = 0;
211 Iter iter_;
212 Iter next_;
213 Iter end_;
214 };
215
216 template <typename Iter>
MakeUTF8CharIterator(Iter iter,Iter end)217 auto MakeUTF8CharIterator(Iter iter, Iter end) {
218 return UTF8CharIterator<Iter>(iter, end);
219 }
220
/// \brief Create a range of UTF8CharIterator covering the whole string.
///
/// The range starts at an iterator positioned at std::begin(str) and ends at
/// one positioned at std::end(str); both are passed to MakeIterRange().
template <typename T>
auto MakeUTF8CharRange(const T &str) {
    auto first = std::begin(str);
    auto last = std::end(str);
    return MakeIterRange(MakeUTF8CharIterator(first, last),
                         MakeUTF8CharIterator(last, last));
}
226 } // namespace utf8
227 } // namespace fcitx
228
229 #endif // _FCITX_UTILS_UTF8_H_
230