1 /* Copyright (c) 2004-2018 Dovecot authors, see the included COPYING file */
2
3 #include "lib.h"
4 #include "unichar.h"
5 #include "str.h"
6 #include "str-sanitize.h"
7
str_sanitize_skip_start(const char * src,size_t max_bytes)8 static size_t str_sanitize_skip_start(const char *src, size_t max_bytes)
9 {
10 unichar_t chr;
11 size_t i;
12
13 for (i = 0; i < max_bytes && src[i] != '\0'; ) {
14 int len = uni_utf8_get_char_n(src+i, max_bytes-i, &chr);
15 if (len <= 0)
16 break;
17 if ((unsigned char)src[i] < 32)
18 break;
19 i += len;
20 }
21 i_assert(i <= max_bytes);
22 return i;
23 }
24
25
26 static size_t
str_sanitize_skip_start_utf8(const char * src,uintmax_t max_chars)27 str_sanitize_skip_start_utf8(const char *src, uintmax_t max_chars)
28 {
29 unichar_t chr;
30 uintmax_t c;
31 size_t i;
32
33 for (i = 0, c = 0; c < max_chars && src[i] != '\0'; ) {
34 int len = uni_utf8_get_char(src+i, &chr);
35 if (len <= 0)
36 break;
37 if ((unsigned char)src[i] < 32)
38 break;
39 c++;
40 i += len;
41 }
42 i_assert(c <= max_chars);
43 return i;
44 }
45
/* Shortens the part of dest that begins at byte offset initial_pos by one
   step, leaving everything before initial_pos alone. Does nothing when there
   is no data after initial_pos.

   initial_pos is a size_t so that the offset handed in by the caller (which
   comes from str_len()) is never narrowed on its way here. */
static void str_sanitize_truncate_char(string_t *dest, size_t initial_pos)
{
	const unsigned char *tail;
	size_t total_len = str_len(dest);
	size_t tail_len;

	i_assert(total_len >= initial_pos);
	if (total_len == initial_pos)
		return;

	tail = (const unsigned char *)str_data(dest) + initial_pos;
	tail_len = total_len - initial_pos;
	/* Ask for a tail that is at least one byte shorter.
	   NOTE(review): uni_utf8_data_truncate() presumably rounds the new
	   length down so that a multi-byte character isn't split - confirm
	   against unichar.h. */
	str_truncate(dest, initial_pos +
		     uni_utf8_data_truncate(tail, tail_len, tail_len - 1));
}
60
/* Appends a sanitized copy of src to dest, reading at most max_bytes bytes
   of src.

   Bytes that don't decode as UTF-8 and characters whose first byte is below
   0x20 are written as '?'; everything else is copied unchanged. If src was
   not consumed up to its NUL terminator, the appended text is cut down to
   at most max_bytes-3 bytes (or removed entirely when max_bytes < 3) and
   "..." is appended. Data that was in dest before the call is not touched. */
void str_sanitize_append(string_t *dest, const char *src, size_t max_bytes)
{
	const size_t start = str_len(dest);
	size_t pos = 0;
	unichar_t unused_chr;

	while (pos < max_bytes && src[pos] != '\0') {
		int char_bytes = uni_utf8_get_char_n(src + pos, max_bytes - pos,
						     &unused_chr);

		if (char_bytes == 0) {
			/* input ended too early */
			break;
		}

		if (char_bytes < 0) {
			/* invalid UTF-8: replace this single byte and
			   continue decoding from the next one */
			str_append_c(dest, '?');
			pos++;
		} else if ((unsigned char)src[pos] < 32) {
			/* control character */
			str_append_c(dest, '?');
			pos += char_bytes;
		} else {
			str_append_data(dest, src + pos, char_bytes);
			pos += char_bytes;
		}
	}

	if (src[pos] == '\0') {
		/* all of src was used, nothing to mark as truncated */
		return;
	}

	if (max_bytes < 3) {
		/* no room for any of the text next to the "..." marker */
		str_truncate(dest, start);
	} else {
		const size_t keep_max = max_bytes - 3;

		/* drop characters from the end until the marker fits */
		while (str_len(dest) - start > keep_max)
			str_sanitize_truncate_char(dest, start);
	}
	str_append(dest, "...");
}
95
/* Appends a sanitized copy of src to dest, writing at most max_cps
   characters. max_cps must be larger than 0.

   Every byte that doesn't decode as UTF-8 and every character whose first
   byte is below 0x20 is written as UNICODE_REPLACEMENT_CHAR_UTF8; everything
   else is copied unchanged. Each of these counts as one character towards
   max_cps. If src was not consumed up to its NUL terminator, dest is cut
   back to where the most recently written character began (if one was
   written) and UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8 is appended instead.
   Data that was in dest before the call is never removed. */
void str_sanitize_append_utf8(string_t *dest, const char *src,
			      uintmax_t max_cps)
{
	/* Start at the current end of dest: if the loop stops before writing
	   anything (e.g. src begins with an incomplete sequence), the
	   truncation below must not wipe what the caller already had in
	   dest. */
	size_t last_pos = str_len(dest);
	unichar_t chr;
	uintmax_t c;
	size_t i;

	i_assert(max_cps > 0);

	for (i = 0, c = 0; c < max_cps && src[i] != '\0'; ) {
		int len = uni_utf8_get_char(src+i, &chr);
		if (len == 0)
			break; /* input ended too early */

		/* remember where this character starts, so it can be replaced
		   by the ellipsis if the input turns out to be too long */
		last_pos = str_len(dest);
		if (len < 0) {
			/* invalid UTF-8: replace this single byte and
			   continue decoding from the next one */
			str_append(dest, UNICODE_REPLACEMENT_CHAR_UTF8);
			len = 1;
		} else if ((unsigned char)src[i] < 32) {
			/* control character */
			str_append(dest, UNICODE_REPLACEMENT_CHAR_UTF8);
		} else {
			str_append_data(dest, src+i, len);
		}
		i += len;
		/* A replacement is a character in the output as well, so it
		   has to be counted. Otherwise input made of invalid bytes
		   would never reach max_cps and the output size would be
		   unbounded. */
		c++;
	}

	if (src[i] != '\0') {
		str_truncate(dest, last_pos);
		str_append(dest, UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8);
	}
}
131
str_sanitize(const char * src,size_t max_bytes)132 const char *str_sanitize(const char *src, size_t max_bytes)
133 {
134 string_t *str;
135 size_t i;
136
137 if (src == NULL)
138 return NULL;
139
140 i = str_sanitize_skip_start(src, max_bytes);
141 if (src[i] == '\0')
142 return src;
143
144 str = t_str_new(I_MIN(max_bytes, 256));
145 str_sanitize_append(str, src, max_bytes);
146 return str_c(str);
147 }
148
str_sanitize_utf8(const char * src,uintmax_t max_cps)149 const char *str_sanitize_utf8(const char *src, uintmax_t max_cps)
150 {
151 string_t *str;
152 size_t i;
153
154 if (src == NULL)
155 return NULL;
156
157 i = str_sanitize_skip_start_utf8(src, max_cps);
158 if (src[i] == '\0')
159 return src;
160
161 str = t_str_new(I_MIN(max_cps, 256));
162 str_sanitize_append_utf8(str, src, max_cps);
163 return str_c(str);
164 }
165
166