1 //
2 // Copyright Aliaksei Levin (levlam@telegram.org), Arseny Smirnov (arseny30@gmail.com) 2014-2021
3 //
4 // Distributed under the Boost Software License, Version 1.0. (See accompanying
5 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6 //
7 #pragma once
8 
9 #include "td/utils/common.h"
10 #include "td/utils/Slice.h"
11 
12 namespace td {
13 
14 /// checks UTF-8 string for correctness
15 bool check_utf8(CSlice str);
16 
/// tells whether a code unit can be the first code unit of a UTF-8 character;
/// only continuation code units, which have the bit pattern 10xxxxxx, are rejected
inline bool is_utf8_character_first_code_unit(unsigned char c) {
  // the two most significant bits are enough to recognize a continuation code unit
  const unsigned int top_two_bits = c >> 6;
  return top_two_bits != 0x2;
}
21 
22 /// returns length of UTF-8 string in characters
utf8_length(Slice str)23 inline size_t utf8_length(Slice str) {
24   size_t result = 0;
25   for (auto c : str) {
26     result += is_utf8_character_first_code_unit(c);
27   }
28   return result;
29 }
30 
31 /// returns length of UTF-8 string in UTF-16 code units
utf8_utf16_length(Slice str)32 inline size_t utf8_utf16_length(Slice str) {
33   size_t result = 0;
34   for (auto c : str) {
35     result += is_utf8_character_first_code_unit(c) + ((c & 0xf8) == 0xf0);
36   }
37   return result;
38 }
39 
40 /// appends a Unicode character using UTF-8 encoding
41 void append_utf8_character(string &str, uint32 ch);
42 
43 /// moves pointer one UTF-8 character back
prev_utf8_unsafe(const unsigned char * ptr)44 inline const unsigned char *prev_utf8_unsafe(const unsigned char *ptr) {
45   while (!is_utf8_character_first_code_unit(*--ptr)) {
46     // pass
47   }
48   return ptr;
49 }
50 
51 /// moves pointer one UTF-8 character forward and saves code of the skipped character in *code
52 const unsigned char *next_utf8_unsafe(const unsigned char *ptr, uint32 *code, const char *source);
53 
54 /// truncates UTF-8 string to the given length in Unicode characters
55 template <class T>
utf8_truncate(T str,size_t length)56 T utf8_truncate(T str, size_t length) {
57   if (str.size() > length) {
58     for (size_t i = 0; i < str.size(); i++) {
59       if (is_utf8_character_first_code_unit(static_cast<unsigned char>(str[i]))) {
60         if (length == 0) {
61           return str.substr(0, i);
62         } else {
63           length--;
64         }
65       }
66     }
67   }
68   return str;
69 }
70 
/// truncates UTF-8 string to the given length given in UTF-16 code units;
/// a character encoded with 4 UTF-8 code units occupies 2 UTF-16 code units (a surrogate pair)
/// and is never split: if only 1 code unit of the limit is left, the string is cut before it,
/// so the result never exceeds `length` UTF-16 code units;
/// T must provide size(), operator[] and substr(pos, count)
template <class T>
T utf8_utf16_truncate(T str, size_t length) {
  for (size_t i = 0; i < str.size(); i++) {
    const auto c = static_cast<unsigned char>(str[i]);
    if ((c & 0xC0) == 0x80) {
      // continuation code unit (10xxxxxx), i.e. not a first code unit of a character
      continue;
    }

    // >= 4 bytes in symbol => surrogate pair
    const size_t utf16_units = c >= 0xf0 ? 2 : 1;
    if (length < utf16_units) {
      // the character doesn't fit; comparing before subtracting keeps the unsigned
      // counter from wrapping around, which would disable the limit entirely
      return str.substr(0, i);
    }
    length -= utf16_units;
  }
  return str;
}
89 
/// returns the part of UTF-8 string that remains after skipping the prefix
/// kept by utf8_truncate(str, offset), i.e. offset is given in Unicode characters
template <class T>
T utf8_substr(T str, size_t offset) {
  if (offset != 0) {
    // size of the truncated prefix is the position at which the result begins
    auto prefix_size = utf8_truncate(str, offset).size();
    return str.substr(prefix_size);
  }
  return str;
}
98 
/// returns the part of UTF-8 string selected by offset and length, both given in Unicode characters
template <class T>
T utf8_substr(T str, size_t offset, size_t length) {
  return utf8_truncate(
      utf8_substr(str, offset),  // skip the first `offset` characters
      length);                   // and keep at most `length` characters of the rest
}
103 
/// returns the part of UTF-8 string that remains after skipping the prefix
/// kept by utf8_utf16_truncate(str, offset), i.e. offset is given in UTF-16 code units
template <class T>
T utf8_utf16_substr(T str, size_t offset) {
  if (offset != 0) {
    // size of the truncated prefix is the position at which the result begins
    auto prefix_size = utf8_utf16_truncate(str, offset).size();
    return str.substr(prefix_size);
  }
  return str;
}
112 
/// returns the part of UTF-8 string selected by offset and length, both given in UTF-16 code units
template <class T>
T utf8_utf16_substr(T str, size_t offset, size_t length) {
  return utf8_utf16_truncate(
      utf8_utf16_substr(str, offset),  // skip the first `offset` UTF-16 code units
      length);                         // and keep at most `length` UTF-16 code units of the rest
}
117 
118 /// Returns UTF-8 string converted to lower case.
119 string utf8_to_lower(Slice str);
120 
121 }  // namespace td
122