1 /* Copyright (c) 2006-2018 Dovecot authors, see the included COPYING file */
2 
3 #include "lib.h"
4 #include "array.h"
5 #include "file-lock.h"
6 #include "istream.h"
7 #include "time-util.h"
8 #include "unichar.h"
9 #include "squat-trie.h"
10 #include "squat-uidlist.h"
11 
12 #include <stdio.h>
13 #include <unistd.h>
14 #include <fcntl.h>
15 #include <time.h>
16 #include <sys/time.h>
17 
/* Write the ranges stored in result to stdout as one comma-separated line.
   A range whose seq1 and seq2 are equal is printed as a single number,
   any other range as "seq1-seq2". The line always ends with a newline,
   even when the array is empty. */
static void result_print(ARRAY_TYPE(seq_range) *result)
{
	const struct seq_range *ranges;
	unsigned int idx, range_count;

	ranges = array_get(result, &range_count);
	for (idx = 0; idx < range_count; idx++) {
		const struct seq_range *cur = &ranges[idx];

		/* separator goes before every element except the first */
		if (idx > 0)
			printf(",");
		if (cur->seq1 == cur->seq2) {
			printf("%u", cur->seq1);
		} else {
			printf("%u", cur->seq1);
			printf("-%u", cur->seq2);
		}
	}
	printf("\n");
}
33 
/* Test driver for the squat index.

   Builds a fresh index (at fixed paths under /tmp) from the file named by
   argv[1], treating lines beginning with "From " as message separators.
   Timing and size statistics are written to stderr. Afterwards every line
   read from stdin is used as a lookup key against the index and the
   resulting "definite" and "maybe" UID ranges are printed to stdout.

   Returns 0 on success, 1 if no input file was given, the file could not
   be opened, or building the index failed. */
int main(int argc ATTR_UNUSED, char *argv[])
{
	const char *trie_path = "/tmp/squat-test-index.search";
	const char *uidlist_path = "/tmp/squat-test-index.search.uids";
	struct squat_trie *trie;
	struct squat_trie_build_context *build_ctx;
	struct istream *input;
	struct stat trie_st, uidlist_st;
	ARRAY_TYPE(seq_range) definite_uids, maybe_uids;
	char *line, *str, buf[4096];
	buffer_t *valid;
	int ret, fd;
	unsigned int last = 0, seq = 1, node_count, uidlist_count;
	size_t len;
	enum squat_index_type index_type;
	bool data_header = TRUE, first = TRUE, skip_body = FALSE;
	bool mime_header = TRUE;
	size_t trie_mem, uidlist_mem;
	clock_t clock_start, clock_end;
	struct timeval tv_start, tv_end;
	double cputime;

	/* argv[] is NULL-terminated, so this detects a missing input file
	   argument without needing argc. Checked first so that a bad
	   invocation doesn't delete an existing test index. */
	if (argv[0] == NULL || argv[1] == NULL) {
		fprintf(stderr, "Usage: %s <input file>\n",
			argv[0] != NULL ? argv[0] : "squat-test");
		return 1;
	}

	lib_init();
	/* always start from an empty index */
	i_unlink_if_exists(trie_path);
	i_unlink_if_exists(uidlist_path);
	trie = squat_trie_init(trie_path, time(NULL),
			       FILE_LOCK_METHOD_FCNTL, 0, 0600, (gid_t)-1);

	clock_start = clock();
	i_gettimeofday(&tv_start);

	fd = open(argv[1], O_RDONLY);
	if (fd == -1) {
		i_error("open(%s) failed: %m", argv[1]);
		return 1;
	}

	if (squat_trie_build_init(trie, &build_ctx) < 0) {
		i_close_fd(&fd);
		return 1;
	}

	valid = buffer_create_dynamic(default_pool, 4096);
	input = i_stream_create_fd(fd, SIZE_MAX);
	ret = 0;
	while (ret == 0 && (line = i_stream_read_next_line(input)) != NULL) {
		/* progress indicator, refreshed once per 100 kB of input */
		if (last != input->v_offset/(1024*100)) {
			fprintf(stderr, "\r%ukB", (unsigned)(input->v_offset/1024));
			fflush(stderr);
			last = input->v_offset/(1024*100);
		}
		if (str_begins(line, "From ")) {
			/* start of the next message; the very first
			   separator keeps seq at 1 */
			if (!first)
				seq++;
			data_header = TRUE;
			skip_body = FALSE;
			mime_header = TRUE;
			continue;
		}
		first = FALSE;

		if (str_begins(line, "--")) {
			/* treated as a MIME boundary: a new part's headers
			   follow */
			skip_body = FALSE;
			mime_header = TRUE;
		}

		if (mime_header) {
			if (*line == '\0') {
				/* empty line ends the header block */
				data_header = FALSE;
				mime_header = FALSE;
				continue;
			}

			/* skip bodies that are neither text/ nor message/,
			   and anything base64 encoded */
			if (strncasecmp(line, "Content-Type:", 13) == 0 &&
			    strncasecmp(line, "Content-Type: text/", 19) != 0 &&
			    strncasecmp(line, "Content-Type: message/", 22) != 0)
				skip_body = TRUE;
			else if (strncasecmp(line, "Content-Transfer-Encoding: base64", 33) == 0)
				skip_body = TRUE;
		} else if (skip_body)
			continue;
		if (*line == '\0')
			continue;

		/* we're actually indexing here headers as bodies and bodies
		   as headers. it doesn't really matter in this test, and
		   fixing it would require storing headers temporarily
		   elsewhere and index them only after the body */
		index_type = !data_header ? SQUAT_INDEX_TYPE_HEADER :
			SQUAT_INDEX_TYPE_BODY;

		/* index the line as-is if it's valid UTF-8, otherwise index
		   whatever uni_utf8_get_valid_data() wrote to the buffer
		   (nothing is indexed if the buffer ended up empty) */
		buffer_set_used_size(valid, 0);
		len = strlen(line);
		if (uni_utf8_get_valid_data((const unsigned char *)line,
					    len, valid)) {
			ret = squat_trie_build_more(build_ctx, seq, index_type,
						    (const void *)line, len);
		} else if (valid->used > 0) {
			ret = squat_trie_build_more(build_ctx, seq, index_type,
						    valid->data, valid->used);
		}
	}
	buffer_free(&valid);
	if (squat_trie_build_deinit(&build_ctx, NULL) < 0)
		ret = -1;
	if (ret < 0) {
		printf("build broken\n");
		return 1;
	}

	clock_end = clock();
	i_gettimeofday(&tv_end);

	cputime = (double)(clock_end - clock_start) / CLOCKS_PER_SEC;
	fprintf(stderr, "\n - Index time: %.2f CPU seconds, "
		"%.2f real seconds (%.02fMB/CPUs)\n", cputime,
		timeval_diff_msecs(&tv_end, &tv_start)/1000.0,
		input->v_offset / cputime / (1024*1024));

	/* Only st_size is used below. If stat() fails, report it and use 0
	   rather than printing an uninitialized value. */
	if (stat(trie_path, &trie_st) < 0) {
		i_error("stat(%s) failed: %m", trie_path);
		trie_st.st_size = 0;
	}
	if (stat(uidlist_path, &uidlist_st) < 0) {
		i_error("stat(%s) failed: %m", uidlist_path);
		uidlist_st.st_size = 0;
	}

	trie_mem = squat_trie_mem_used(trie, &node_count);
	uidlist_mem = squat_uidlist_mem_used(squat_trie_get_uidlist(trie),
					     &uidlist_count);
	fprintf(stderr, " - memory: %uk for trie, %uk for uidlist\n",
		(unsigned)(trie_mem/1024), (unsigned)(uidlist_mem/1024));
	/* percentages are index file size relative to the input size */
	fprintf(stderr, " - %"PRIuUOFF_T" bytes in %u nodes (%.02f%%)\n",
		trie_st.st_size, node_count,
		trie_st.st_size / (float)input->v_offset * 100.0);
	fprintf(stderr, " - %"PRIuUOFF_T" bytes in %u UID lists (%.02f%%)\n",
		uidlist_st.st_size, uidlist_count,
		uidlist_st.st_size / (float)input->v_offset * 100.0);
	fprintf(stderr, " - %"PRIuUOFF_T" bytes total of %"
		PRIuUOFF_T" (%.02f%%)\n",
		(trie_st.st_size + uidlist_st.st_size), input->v_offset,
		(trie_st.st_size + uidlist_st.st_size) /
		(float)input->v_offset * 100.0);

	i_stream_unref(&input);
	i_close_fd(&fd);

	i_array_init(&definite_uids, 128);
	i_array_init(&maybe_uids, 128);
	while ((str = fgets(buf, sizeof(buf), stdin)) != NULL) {
		/* fgets() keeps the trailing newline when there is one.
		   Strip it only if it is actually present: the last line
		   may lack it, and an empty string must not be indexed
		   at [-1]. */
		len = strlen(str);
		if (len > 0 && str[len-1] == '\n')
			str[len-1] = '\0';

		i_gettimeofday(&tv_start);
		ret = squat_trie_lookup(trie, str, SQUAT_INDEX_TYPE_HEADER |
					SQUAT_INDEX_TYPE_BODY,
					&definite_uids, &maybe_uids);
		if (ret < 0)
			printf("error\n");
		else {
			i_gettimeofday(&tv_end);
			/* note: measured with gettimeofday(), i.e. this is
			   elapsed wall-clock time despite the label */
			printf(" - Search took %.05f CPU seconds\n",
			       timeval_diff_usecs(&tv_end, &tv_start)/1000000.0);
			printf(" - definite uids: ");
			result_print(&definite_uids);
			printf(" - maybe uids: ");
			result_print(&maybe_uids);
		}
	}
	return 0;
}
198