1 /* Copyright (c) 2004-2018 Dovecot authors, see the included COPYING file */
2 
3 #include "lib.h"
4 #include "unichar.h"
5 #include "str.h"
6 #include "str-sanitize.h"
7 
str_sanitize_skip_start(const char * src,size_t max_bytes)8 static size_t str_sanitize_skip_start(const char *src, size_t max_bytes)
9 {
10 	unichar_t chr;
11 	size_t i;
12 
13 	for (i = 0; i < max_bytes && src[i] != '\0'; ) {
14 		int len = uni_utf8_get_char_n(src+i, max_bytes-i, &chr);
15 		if (len <= 0)
16 			break;
17 		if ((unsigned char)src[i] < 32)
18 			break;
19 		i += len;
20 	}
21 	i_assert(i <= max_bytes);
22 	return i;
23 }
24 
25 
26 static size_t
str_sanitize_skip_start_utf8(const char * src,uintmax_t max_chars)27 str_sanitize_skip_start_utf8(const char *src, uintmax_t max_chars)
28 {
29 	unichar_t chr;
30 	uintmax_t c;
31 	size_t i;
32 
33 	for (i = 0, c = 0; c < max_chars && src[i] != '\0'; ) {
34 		int len = uni_utf8_get_char(src+i, &chr);
35 		if (len <= 0)
36 			break;
37 		if ((unsigned char)src[i] < 32)
38 			break;
39 		c++;
40 		i += len;
41 	}
42 	i_assert(c <= max_chars);
43 	return i;
44 }
45 
/* Shorten the data that was appended to dest after initial_pos.

   The appended region (everything from initial_pos to the end of dest) is
   handed to uni_utf8_data_truncate() with a maximum new size one byte
   smaller than its current size, and dest is truncated to the length it
   returns (presumably a UTF-8 character boundary - confirm against
   uni_utf8_data_truncate()). Nothing happens when no data follows
   initial_pos.

   initial_pos is a size_t so that offsets obtained from str_len() are not
   silently narrowed on platforms where size_t is wider than unsigned int. */
static void str_sanitize_truncate_char(string_t *dest, size_t initial_pos)
{
	const unsigned char *data = str_data(dest);
	size_t len = str_len(dest);

	i_assert(len >= initial_pos);
	if (len == initial_pos)
		return;

	/* only look at the part that was appended after initial_pos */
	data += initial_pos;
	len -= initial_pos;
	str_truncate(dest, initial_pos +
		     uni_utf8_data_truncate(data, len, len-1));
}
60 
/* Append a sanitized copy of src to dest, reading at most max_bytes bytes
   of src.

   Bytes below 32 and bytes that do not decode as UTF-8 are each replaced
   with '?'. If src was not consumed up to its terminating NUL, the appended
   text is shortened and "..." is added: when max_bytes >= 3 the appended
   part is cut down to at most max_bytes-3 bytes first, otherwise everything
   appended by this call is dropped and only "..." remains. */
void str_sanitize_append(string_t *dest, const char *src, size_t max_bytes)
{
	const size_t start_pos = str_len(dest);
	unichar_t chr;
	size_t pos = 0;

	while (pos < max_bytes && src[pos] != '\0') {
		int char_len = uni_utf8_get_char_n(src + pos, max_bytes - pos,
						   &chr);

		if (char_len == 0) {
			/* input ended too early */
			break;
		}
		if (char_len < 0) {
			/* invalid UTF-8: replace this byte, retry at the next */
			str_append_c(dest, '?');
			pos++;
		} else {
			if ((unsigned char)src[pos] < 32)
				str_append_c(dest, '?');
			else
				str_append_data(dest, src + pos, char_len);
			pos += char_len;
		}
	}

	if (src[pos] == '\0') {
		/* all of src was handled, no truncation marker needed */
		return;
	}

	if (max_bytes < 3) {
		/* no room for any text besides the marker */
		str_truncate(dest, start_pos);
	} else {
		while (str_len(dest) - start_pos > max_bytes - 3)
			str_sanitize_truncate_char(dest, start_pos);
	}
	str_append(dest, "...");
}
95 
/* Append a sanitized copy of src to dest, limited to max_cps code points.

   Bytes below 32 and bytes that do not decode as UTF-8 are each replaced
   with UNICODE_REPLACEMENT_CHAR_UTF8. If src was not consumed up to its
   terminating NUL, dest is cut back to where the most recently handled
   character started and UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8 is appended.

   max_cps must be greater than zero (asserted).

   NOTE(review): replacements written for invalid bytes do not increase the
   code point counter, so they are not limited by max_cps - confirm whether
   that is intended. */
void str_sanitize_append_utf8(string_t *dest, const char *src,
			      uintmax_t max_cps)
{
	/* Start at the current end of dest: if the loop breaks before
	   anything is appended (src begins with a truncated sequence), the
	   truncation below must not remove data dest already contained. */
	size_t last_pos = str_len(dest);
	unichar_t chr;
	uintmax_t c;
	size_t i;

	i_assert(max_cps > 0);

	for (i = 0, c = 0; c < max_cps && src[i] != '\0'; ) {
		int len = uni_utf8_get_char(src+i, &chr);
		if (len == 0)
			break; /* input ended too early */

		/* remember where this character starts in dest, so it can be
		   replaced by the ellipsis if src turns out to be too long */
		last_pos = str_len(dest);
		if (len < 0) {
			/* invalid UTF-8 */
			str_append(dest, UNICODE_REPLACEMENT_CHAR_UTF8);
			i++;
			continue;
		}
		if ((unsigned char)src[i] < 32)
			str_append(dest, UNICODE_REPLACEMENT_CHAR_UTF8);
		else
			str_append_data(dest, src+i, len);
		i += len;
		c++;
	}

	if (src[i] != '\0') {
		/* src was not fully consumed: mark the truncation */
		str_truncate(dest, last_pos);
		str_append(dest, UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8);
	}
}
131 
str_sanitize(const char * src,size_t max_bytes)132 const char *str_sanitize(const char *src, size_t max_bytes)
133 {
134 	string_t *str;
135 	size_t i;
136 
137 	if (src == NULL)
138 		return NULL;
139 
140 	i = str_sanitize_skip_start(src, max_bytes);
141 	if (src[i] == '\0')
142 		return src;
143 
144 	str = t_str_new(I_MIN(max_bytes, 256));
145 	str_sanitize_append(str, src, max_bytes);
146 	return str_c(str);
147 }
148 
str_sanitize_utf8(const char * src,uintmax_t max_cps)149 const char *str_sanitize_utf8(const char *src, uintmax_t max_cps)
150 {
151 	string_t *str;
152 	size_t i;
153 
154 	if (src == NULL)
155 		return NULL;
156 
157 	i = str_sanitize_skip_start_utf8(src, max_cps);
158 	if (src[i] == '\0')
159 		return src;
160 
161 	str = t_str_new(I_MIN(max_cps, 256));
162 	str_sanitize_append_utf8(str, src, max_cps);
163 	return str_c(str);
164 }
165 
166