1 /*
2  * SPDX-FileCopyrightText: 2015-2017 CSSlayer <wengxt@gmail.com>
3  *
4  * SPDX-License-Identifier: LGPL-2.1-or-later
5  *
6  */
7 #ifndef _FCITX_UTILS_UTF8_H_
8 #define _FCITX_UTILS_UTF8_H_
9 
10 /// \addtogroup FcitxUtils
11 /// \{
12 /// \file
13 /// \brief C++ Utility functions for handling utf8 strings.
14 
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <stdexcept>
#include <string>
#include <string_view>
#include <utility>
#include <fcitx-utils/cutf8.h>
#include <fcitx-utils/misc.h>
#include "fcitxutils_export.h"
20 
21 namespace fcitx {
22 namespace utf8 {
23 
/// \brief Return the number of UTF-8 characters in the string iterator range.
///
/// The range is handed to fcitx_utf8_strnlen() as a pointer to its first
/// element plus its length, so the iterators need to refer to contiguous
/// storage.
///
/// \param start iterator to the first byte of the range.
/// \param end iterator past the last byte of the range.
/// \return the count reported by fcitx_utf8_strnlen(), or 0 for an empty
/// range.
/// \see lengthValidated()
template <typename Iter>
inline size_t length(Iter start, Iter end) {
    // An empty range has no first element: dereferencing start would be
    // undefined behavior (e.g. for an empty std::string_view or
    // std::vector<char>), so answer directly.
    if (start == end) {
        return 0;
    }
    return fcitx_utf8_strnlen(&(*start), std::distance(start, end));
}
30 
/// \brief Return the number of UTF-8 characters in the string.
///
/// Convenience overload that forwards the whole std::begin()/std::end() range
/// of \p s to the iterator based length().
///
/// \param s object usable with std::begin() and std::end().
/// \see lengthValidated()
template <typename T>
inline size_t length(const T &s) {
    const auto first = std::begin(s);
    const auto last = std::end(s);
    return length(first, last);
}
37 
/// \brief Return the number of UTF-8 characters in a sub-range of the string.
///
/// \param s object usable with std::begin().
/// \param start offset of the first element, counted from std::begin(s).
/// \param end offset one past the last element, counted from std::begin(s).
template <typename T>
inline size_t length(const T &s, size_t start, size_t end) {
    const auto rangeBegin = std::next(std::begin(s), start);
    const auto rangeEnd = std::next(std::begin(s), end);
    return length(rangeBegin, rangeEnd);
}
44 
/// \brief Sentinel that lengthValidated() may return when the string is not
/// valid UTF-8. Equal to the largest value representable in size_t.
/// \see lengthValidated()
constexpr size_t INVALID_LENGTH{static_cast<size_t>(-1)};
48 
/// \brief Validate and return the number of UTF-8 characters in the string
/// iterator range.
///
/// The range is handed to fcitx_utf8_strnlen_validated() as a pointer to its
/// first element plus its length, so the iterators need to refer to
/// contiguous storage.
///
/// \param start iterator to the first byte of the range.
/// \param end iterator past the last byte of the range.
/// \return the character count, 0 for an empty range, or INVALID_LENGTH if
/// the string is not a valid utf8 string.
template <typename Iter>
inline size_t lengthValidated(Iter start, Iter end) {
    // An empty range has no first element: dereferencing start would be
    // undefined behavior, and an empty string trivially holds zero
    // characters.
    if (start == end) {
        return 0;
    }
    return fcitx_utf8_strnlen_validated(&(*start), std::distance(start, end));
}
57 
/// \brief Validate and return the number of UTF-8 characters in the string.
///
/// Forwards the whole std::begin()/std::end() range of \p s to the iterator
/// based lengthValidated().
///
/// \param s object usable with std::begin() and std::end().
/// \return the character count, or INVALID_LENGTH if the string is not a
/// valid utf8 string.
template <typename T>
inline size_t lengthValidated(const T &s) {
    const auto first = std::begin(s);
    const auto last = std::end(s);
    return lengthValidated(first, last);
}
65 
66 /// \brief Check if the string iterator range is valid utf8 string
67 template <typename Iter>
validate(Iter start,Iter end)68 inline bool validate(Iter start, Iter end) {
69     return lengthValidated(start, end) != INVALID_LENGTH;
70 }
71 
/// \brief Check whether the string is a valid utf8 string.
///
/// Forwards the whole std::begin()/std::end() range of \p s to the iterator
/// based validate().
///
/// \param s object usable with std::begin() and std::end().
template <typename T>
inline bool validate(const T &s) {
    const auto first = std::begin(s);
    const auto last = std::end(s);
    return validate(first, last);
}
77 
78 /// \brief Convert UCS4 to UTF8 string.
79 FCITXUTILS_EXPORT std::string UCS4ToUTF8(uint32_t code);
80 
81 /// \brief Check if a ucs4 is valid.
82 FCITXUTILS_EXPORT bool UCS4IsValid(uint32_t code);
83 
/// \brief Possible return value for getChar. Numerically 0xFFFFFFFF.
constexpr uint32_t INVALID_CHAR{static_cast<uint32_t>(-1)};

/// \brief Possible return value for getChar. Numerically 0xFFFFFFFE.
constexpr uint32_t NOT_ENOUGH_SPACE{static_cast<uint32_t>(-2)};

/// \brief Check that the value is neither of the two sentinels above.
///
/// \param c value to test, typically a result of getChar().
/// \return false if \p c is INVALID_CHAR or NOT_ENOUGH_SPACE, true otherwise.
inline bool isValidChar(uint32_t c) {
    const bool isSentinel = (c == INVALID_CHAR) || (c == NOT_ENOUGH_SPACE);
    return !isSentinel;
}
94 
/// \brief Get the next UCS4 char from iter without crossing end.
///
/// Passes a pointer to the byte at \p iter and the distance to \p end to
/// fcitx_utf8_get_char_validated(); the byte length of the character is not
/// requested.
///
/// \param iter iterator to the first byte to decode; it is dereferenced.
/// \param end iterator past the last byte that may be read.
/// \return the decoded value; may be INVALID_CHAR or NOT_ENOUGH_SPACE.
template <typename Iter>
inline uint32_t getChar(Iter iter, Iter end) {
    const char *firstByte = &(*iter);
    const auto available = std::distance(iter, end);
    return fcitx_utf8_get_char_validated(firstByte, available, nullptr);
}
102 
/// \brief Get the first UCS4 char of the string.
///
/// Forwards the whole std::begin()/std::end() range of \p s to the iterator
/// based getChar().
///
/// \param s object usable with std::begin() and std::end().
/// \return the decoded value; may be INVALID_CHAR or NOT_ENOUGH_SPACE.
template <typename T>
inline uint32_t getChar(const T &s) {
    const auto first = std::begin(s);
    const auto last = std::end(s);
    return getChar(first, last);
}
108 
/// \brief Decode the character at iter and return the iterator following it.
///
/// The byte at \p iter and the distance to \p end are passed to
/// fcitx_utf8_get_char_validated(); the returned iterator is \p iter advanced
/// by the byte length that call reports. If the call leaves the length
/// untouched, the returned iterator equals \p iter.
///
/// \param iter iterator to the first byte to decode; it is dereferenced.
/// \param end iterator past the last byte that may be read.
/// \param chr receives the decoded value, which may be INVALID_CHAR or
/// NOT_ENOUGH_SPACE; must not be null.
/// \return \p iter advanced by the reported byte length.
template <typename Iter>
inline Iter getNextChar(Iter iter, Iter end, uint32_t *chr) {
    int consumed = 0;
    const char *firstByte = &(*iter);
    *chr = fcitx_utf8_get_char_validated(firstByte, std::distance(iter, end),
                                         &consumed);
    return std::next(iter, consumed);
}
116 
117 /// \brief get the byte length of next N utf-8 character.
118 ///
119 /// This function has no error check on invalid string or end of string. Check
120 /// the string before use it.
121 template <typename Iter>
ncharByteLength(Iter iter,size_t n)122 inline int ncharByteLength(Iter iter, size_t n) {
123     const char *c = &(*iter);
124     int diff = fcitx_utf8_get_nth_char(c, n) - c;
125     return diff;
126 }
127 
/// \brief Move iter over the next n characters.
///
/// Like ncharByteLength(), which it relies on, this performs no validation.
///
/// \param iter iterator to the first byte.
/// \param n number of characters to skip.
/// \return \p iter advanced by the byte length of the next \p n characters.
template <typename Iter>
inline Iter nextNChar(Iter iter, size_t n) {
    const int byteCount = ncharByteLength(iter, n);
    return std::next(iter, byteCount);
}
133 
/// \brief Move iter over the next one character.
///
/// \param iter iterator to the first byte of the character to skip.
/// \return \p iter advanced by the byte length of one character, as measured
/// by ncharByteLength().
template <typename Iter>
Iter nextChar(Iter iter) {
    // Same as nextNChar(iter, 1), spelled out.
    const int byteCount = ncharByteLength(iter, 1);
    return std::next(iter, byteCount);
}
139 
140 template <typename Iter>
getLastChar(Iter iter,Iter end)141 uint32_t getLastChar(Iter iter, Iter end) {
142     uint32_t c = NOT_ENOUGH_SPACE;
143     while (iter != end) {
144         iter = getNextChar(iter, end, &c);
145         if (!isValidChar(c)) {
146             break;
147         }
148     }
149     return c;
150 }
151 
/// \brief Return the last decoded character of the string.
///
/// Forwards the whole std::begin()/std::end() range of \p str to the iterator
/// based getLastChar().
///
/// \param str object usable with std::begin() and std::end().
template <typename T>
uint32_t getLastChar(const T &str) {
    const auto first = std::begin(str);
    const auto last = std::end(str);
    return getLastChar(first, last);
}
156 
157 /// \brief Helper class to iterate character, you need to validate the string
158 /// before using it.
159 template <typename Iter>
160 class UTF8CharIterator {
161 public:
162     typedef std::input_iterator_tag iterator_category;
163     typedef uint32_t value_type;
164     typedef std::ptrdiff_t difference_type;
165     typedef const value_type &reference;
166     typedef const value_type *pointer;
167 
UTF8CharIterator(Iter iter,Iter end)168     UTF8CharIterator(Iter iter, Iter end) : iter_(iter), end_(end) { update(); }
FCITX_INLINE_DEFINE_DEFAULT_DTOR_AND_COPY(UTF8CharIterator)169     FCITX_INLINE_DEFINE_DEFAULT_DTOR_AND_COPY(UTF8CharIterator)
170 
171     reference operator*() const { return currentChar_; }
172 
173     pointer operator->() const { return &currentChar_; }
174 
charRange()175     std::pair<Iter, Iter> charRange() const { return {iter_, next_}; }
176 
charLength()177     size_t charLength() const { return std::distance(iter_, next_); }
178 
view()179     std::string_view view() const {
180         return std::string_view{&*iter_, charLength()};
181     }
182 
183     UTF8CharIterator &operator++() {
184         iter_ = next_;
185         update();
186         return *this;
187     }
188 
189     UTF8CharIterator operator++(int) {
190         auto old = *this;
191         ++(*this);
192         return old;
193     }
194 
195     bool operator==(const UTF8CharIterator &other) {
196         return iter_ == other.iter_;
197     }
198     bool operator!=(const UTF8CharIterator &other) {
199         return !operator==(other);
200     }
201 
202 private:
update()203     void update() {
204         next_ = getNextChar(iter_, end_, &currentChar_);
205         if (iter_ != end_ && iter_ == next_) {
206             throw std::runtime_error("Invalid UTF8 character.");
207         }
208     }
209 
210     uint32_t currentChar_ = 0;
211     Iter iter_;
212     Iter next_;
213     Iter end_;
214 };
215 
216 template <typename Iter>
MakeUTF8CharIterator(Iter iter,Iter end)217 auto MakeUTF8CharIterator(Iter iter, Iter end) {
218     return UTF8CharIterator<Iter>(iter, end);
219 }
220 
/// \brief Build an iterable range of the characters in the string.
///
/// The range is created with MakeIterRange() from a character iterator at
/// std::begin(str) and one at std::end(str). Only iterators are stored, so
/// \p str has to outlive the returned range.
///
/// \param str object usable with std::begin() and std::end().
template <typename T>
auto MakeUTF8CharRange(const T &str) {
    const auto first = std::begin(str);
    const auto last = std::end(str);
    return MakeIterRange(MakeUTF8CharIterator(first, last),
                         MakeUTF8CharIterator(last, last));
}
226 } // namespace utf8
227 } // namespace fcitx
228 
229 #endif // _FCITX_UTILS_UTF8_H_
230