1 /*
2  * DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
3  * Version 2, December 2004
4  *
5  * Copyright (C) 2012-2013 Sebastien Tricaud <sebastien@honeynet.org>
6  * Copyright (C) 2013 Cedric LE ROUX <cedric.lrx@gmail.com>
7  *
8  * Everyone is permitted to copy and distribute verbatim or modified
9  * copies of this license document, and changing it is allowed as long
10  * as the name is changed.
11  *
12  * DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
13  * TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
14  *
15  *  0. You just DO WHAT THE FUCK YOU WANT TO.
16  *
17  */
18 
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 
23 #ifdef LINUX
24 #define __USE_BSD
25 #endif
26 
27 
28 #include <faup/faup.h>
29 #include <faup/tld.h>
30 #include <faup/tld-tree.h>
31 
32 #include <faup/utarray.h>
33 
34 static UT_array *_tlds;
35 
faup_tld_tree_debug(TLDNode * tld_tree,int pos,char * type)36 void faup_tld_tree_debug(TLDNode *tld_tree, int pos, char *type)
37 {
38  	printf("[%d] type: '%s'\n", pos, type);
39  	printf("[%d] c   : '%c'\n", pos, tld_tree->c);
40  	printf("[%d] EoT : %d\n", pos, tld_tree->EoT);
41  	if (tld_tree->sibling) {
42 		faup_tld_tree_debug(tld_tree->sibling, pos+1, "sibling");
43 	}
44 
45 	if (tld_tree->kid) {
46 		faup_tld_tree_debug(tld_tree->kid, pos+1, "kid");
47 	}
48 }
49 
50 
_faup_tld_tree_allocate_kid(TLDNode ** Node,char c,bool EoT,bool move_cursor)51 static int _faup_tld_tree_allocate_kid(TLDNode **Node, char c, bool EoT, bool move_cursor)
52 {
53 	if( (*Node)->kid != NULL )
54 		return -1;
55 
56 	//	printf("kid\n");
57 
58 	(*Node)->kid = calloc(1, sizeof(TLDNode));
59 	if( (*Node)->kid == NULL )
60 		return -1;
61 
62 	((TLDNode *)(*Node)->kid)->c   = c;
63 	((TLDNode *)(*Node)->kid)->EoT = EoT;
64 
65 	if( move_cursor ) {
66 		*Node = (TLDNode *)(*Node)->kid;
67 	}
68 	return 0;
69 }
70 
_faup_tld_tree_allocate_sibling(TLDNode ** Node,char c,bool EoT,bool move_cursor)71 static int _faup_tld_tree_allocate_sibling(TLDNode **Node, char c, bool EoT, bool move_cursor)
72 {
73 	if( (*Node)->sibling != NULL )
74 		return -1;
75 
76 	//	printf("sibling\n");
77 
78 	(*Node)->sibling = calloc(1, sizeof(TLDNode));
79 	if( (*Node)->sibling== NULL ) {
80 		return -1;
81 	}
82 
83 	((TLDNode *)(*Node)->sibling)->c   = c;
84 	((TLDNode *)(*Node)->sibling)->EoT = EoT;
85 
86 	if( move_cursor ) {
87 		*Node = (TLDNode *)(*Node)->sibling;
88 	}
89 	return 0;
90 }
91 
92 /*
93  * Add a node the the Trie (should be kid or sibling of the root)
94  *
95  */
_faup_tld_tree_add_node(TLDNode ** Tree,char * tld,int tld_len)96 static int _faup_tld_tree_add_node(TLDNode **Tree, char *tld, int tld_len)
97 {
98 	bool lastChar, nextIsDot, nextIsException;
99 	char *p;
100 	int ret;
101 	TLDNode *pNode  = *Tree;
102 	int counter = 0;
103 
104 	//printf("Adding the TLD:[%s]\n", tld);
105 
106 	// Add the TLD to the Trie in reverse order
107 	p = tld + tld_len -1;
108 	while(counter < tld_len)
109 	{
110 		lastChar        = (counter+1 == tld_len) ? true : false;
111 		if (counter+1 < tld_len) {
112 		  nextIsDot       = (*(p-1) == '.') ? true  : false;
113 		  nextIsException = (*(p-1) == '!') ? true  : false;
114 		} else {
115 		  nextIsDot = false;
116 		  nextIsException = false;
117 		}
118 
119 		// char does not exist in the Trie at that position
120 		if( pNode->kid == NULL )
121 		{
122 			ret = _faup_tld_tree_allocate_kid(&pNode, *p, lastChar | nextIsDot | nextIsException, true);
123 			if( ret == -1 ) {
124 				return -1;
125 			}
126 		}
127 		// char may exist in the Trie
128 		else
129 		{
130 			pNode = pNode->kid;
131 
132 			while( (pNode->sibling != NULL) && (pNode->c != *p) ) {
133 				pNode = pNode->sibling;
134 			}
135 
136 			// char does not exist
137 			if( pNode->c != *p )
138 			{
139 				ret = _faup_tld_tree_allocate_sibling(&pNode, *p, lastChar | nextIsDot | nextIsException, true);
140 				if( ret == -1 ) {
141 					return -1;
142 				}
143 			}
144 			// char already exist at that position but not necessary as an ended one.
145 			else if( lastChar || nextIsDot || nextIsException ) {
146 				pNode->EoT = true;
147 			}
148 		}
149 
150 		counter++;
151 		p--;
152 	}
153 
154 	return 0;
155 }
156 
157 
158 /*
159  * Add a node to the Trie
160  * Define whether it's an exception (!<domain.tld>) or a regular TLD (including wildcards ones)
161  * Exception go under the Tree root's kid part, regular under the root's sibling.
162  */
faup_tld_tree_add_node(TLDNode ** Tree,char * TLD,int tld_len)163 static int faup_tld_tree_add_node(TLDNode **Tree, char *TLD, int tld_len)
164 {
165 	TLDNode *pNode;
166 
167 	if (!Tree) {
168 		fprintf(stderr, "%s Tree does not exists!\n", __FUNCTION__);
169 		return -1;
170 	}
171 
172 	// regular tld
173 	if( *TLD != '!' )
174 	{
175 		// First node
176 		if( (*Tree)->kid == NULL )
177 		{
178 			(*Tree)->kid = calloc(1, sizeof(TLDNode));
179 			if( (*Tree)->kid == NULL ) {
180 				return -1;
181 			}
182 
183 			(*Tree)->kid->c = '\0';
184 		}
185 		pNode = (*Tree)->kid;
186 	}
187 	// exception
188 	else
189 	{
190 		if( (*Tree)->sibling == NULL )
191 		{
192 			(*Tree)->sibling = calloc(1, sizeof(TLDNode));
193 			if( (*Tree)->sibling == NULL ) {
194 				return -1;
195 			}
196 
197 			(*Tree)->sibling->c = '\0';
198 		}
199 		pNode = (*Tree)->sibling;
200 	}
201 
202 	return _faup_tld_tree_add_node(&pNode, TLD, tld_len);
203 }
204 
205 
206 
faup_tld_tree_add_tld(char * tld,void * user_data)207 static void faup_tld_tree_add_tld(char *tld, void *user_data)
208 {
209 	int retval;
210 
211 	TLDNode *Tree = (TLDNode *)user_data;
212 
213 	retval = faup_tld_tree_add_node(&Tree, tld, strlen(tld));
214 	if (retval) {
215 		fprintf(stderr, "Error adding the tld '%s' to the tree\n", tld);
216 	}
217 
218 }
219 
220 /* Return a TLD Tree (TLDNode *) or NULL on error
221  *
222  * Load TLD from a char array and transform it as a 'reversed' Tree (= Trie).
223  * Ex: '.com' is loaded as 'm->o->c->.'
224  *
225  * There is actually 2 Tree :
226  * Tree->siblings contains TLD and wildcards (ex: *.om)
227  * Tree->kids     contains exceptions TLD (ex: !siemens.om)
228  *
229  * See :
230  * - http://en.wikipedia.org/wiki/Trie
231  * - http://en.wikipedia.org/wiki/Suffix_tree
232  */
233 
faup_tld_tree_new(void)234 TLDNode *faup_tld_tree_new(void)
235 {
236 	TLDNode *Tree;
237 	int nbTLD, ret, i;
238 
239 	// Initialize the tree
240 	Tree = calloc(1, sizeof(TLDNode));
241 	if( Tree == NULL ) {
242 		return NULL;
243 	}
244 	Tree->c = '\0';
245 
246 	ret = faup_tld_array_populate();
247 	if (ret < 0) {
248 	  fprintf(stderr, "Error with faup_tld_array_populate from %s; Symptom: double initialization?\n", __FUNCTION__);
249 	  return NULL;
250 	}
251 	faup_tld_array_foreach(faup_tld_tree_add_tld, Tree);
252 	faup_tld_array_destroy();
253 
254 	return Tree;
255 }
256 
257 
faup_tld_tree_free(TLDNode * tld_tree,TLDNode * last_tld_tree,int pos)258 void faup_tld_tree_free(TLDNode *tld_tree, TLDNode *last_tld_tree, int pos)
259 {
260   if (!tld_tree) return;
261 
262   if (tld_tree->kid) {
263     faup_tld_tree_free(tld_tree->kid, tld_tree, pos+1);
264   }
265   if (tld_tree->sibling) {
266     faup_tld_tree_free(tld_tree->sibling, tld_tree, pos+1);
267   }
268 
269   free(tld_tree);
270 }
271 
272 /*
273  * Return TRUE if the provided tld is found in the provided Tree.
274  * or if the provided tld match a wildcard tld in the Tree.
275  * FALSE in any other case (no match)
276  *
277  */
faup_tld_tree_tld_exists(TLDNode * Tree,const char * tld,int tld_len)278 static bool faup_tld_tree_tld_exists(TLDNode *Tree, const char *tld, int tld_len)
279 {
280 	TLDNode *pNode = Tree;
281 	const char *p = NULL;
282 	bool wildcard;
283 	int lenght = 0;
284 
285 	if (!Tree) {
286 		fprintf(stderr, "Error: Tree does not exists!\n");
287 		return false;
288 	}
289 
290 	p = tld + tld_len - 1;
291 	while (tld_len-- > 0) {
292 		wildcard = false;
293 		pNode = pNode->kid;
294 
295 		while( pNode && (pNode->c != *p) ) {
296 			if (pNode->c == '*') {
297 				wildcard = true;
298 			}
299 			pNode = pNode->sibling;
300 		}
301 
302 		if( ! pNode ) {
303 			if( wildcard ) {
304 				while( tld_len-- ) {
305 					if( tld_len && (*(--p) == '.') ) {
306 						return false;
307 					}
308 				}
309 				return true;
310 			}
311 			return false;
312 		}
313 
314 		p--;
315 	}
316 
317 	if( pNode->EoT ) {
318 		return true;
319 	}
320 
321 	return false;
322 }
323 
324 /*
325  * Return the starting position of the tld in host or -1 if not found
326  * Require a TLD Tree.
327  * Ex:
328  * - google.com => 7 (='com')
329  * - abc.co.uk  => 4 (='co.uk')
330  * - google.nawak => -1 (NOT FOUND)
331  *
332  */
faup_tld_tree_extract(faup_handler_t * fh,TLDNode * tld_tree)333 faup_tld_tree_extracted_t faup_tld_tree_extract(faup_handler_t *fh, TLDNode *tld_tree)
334 {
335 	const char *p;
336 	int32_t host_last_pos;
337 	char *last;
338 	bool found;
339 	uint32_t step = 0;
340 	uint32_t tld_len = 0;
341 	uint32_t tld_exception_len = 0;
342 	uint32_t last_len_just_for_host = 0;
343 	size_t last_len;
344 	size_t org_str_len;
345 
346 	faup_tld_tree_extracted_t tld_extracted;
347 
348 	uint32_t counter;
349 	bool has_a_dot = false;
350 
351 	tld_extracted.pos = -1;
352 	tld_extracted.size = 0;
353 
354 	if (!tld_tree) {
355 		fprintf(stderr, "(Error) No TLD Tree given!\n");
356 		return tld_extracted;
357 	}
358 
359 	if (!tld_tree->kid) {
360 		fprintf(stderr, "(Warning) Cannot extract TLD > 1. Mozilla list does not exists. Please download it (faup -u)\n");
361 		return tld_extracted;
362 	}
363 
364 	last = NULL;
365 
366 	host_last_pos = fh->faup.features.host.pos + fh->faup.features.host.size;
367 
368 	p = fh->faup.org_str + host_last_pos - 1;
369 
370 	counter = fh->faup.features.host.size - 1;
371 	while( counter )
372 	{
373 		while( *(p-1) && (*p != '.') ) {
374 			p--;
375 			counter--;
376 			if (counter <= 0) break;
377 		}
378 
379 		step = ( *p == '.' ) ? 1 : 0;
380 
381 		found = faup_tld_tree_tld_exists(tld_tree->kid, p + step, fh->faup.features.host.size - counter - 1);
382 		if( ! found ) {
383 			break;
384 		} else {
385 			tld_len = fh->faup.features.host.size - counter - 1;
386 		}
387 
388 		last = (char *) p + step;
389 
390 		p--;
391 		counter--;
392 	}
393 
394 	if( last == NULL ) {
395 		return tld_extracted;
396 	}
397 
398 
399 	// We want to retrieve the size of the tld without the useless chars the come afterwards
400 	// www.foo.siemens.om/tagada != www.foo.siemens.om
401 	last_len = counter;
402 	last_len_just_for_host = last_len - (fh->faup.org_str_len - (fh->faup.features.host.pos + fh->faup.features.host.size));
403 
404 	counter = 0;
405 	if (tld_tree->sibling) {
406 		found = faup_tld_tree_tld_exists(tld_tree->sibling, last, last_len_just_for_host);
407 		if( found )
408 		{
409 			// here we have the longest TLD
410 			// but is that an exception ? (ex: !siemens.om vs *.om)
411 			while (counter < tld_len) {
412 				if (*last != '.') {
413 					last++;
414 					tld_exception_len++;
415 				} else {
416 					has_a_dot = true;
417 					break;
418 				}
419 				counter++;
420 			}
421 		}
422 	}
423 
424 	tld_extracted.size = tld_len - tld_exception_len;
425 	if (!tld_extracted.size) {
426 		tld_extracted.size = tld_len;
427 	}
428 
429 	//printf("fh->faup.features.host.pos(%zd), fh->faup.features.host.size(%zd), tld_extracted.size(%zd), counter(%zd)\n", fh->faup.features.host.pos, fh->faup.features.host.size, tld_extracted.size, counter);
430 	tld_extracted.pos = fh->faup.features.host.pos + fh->faup.features.host.size - tld_extracted.size;
431 
432 	if (has_a_dot) {
433 		tld_extracted.pos += 1;
434 		tld_extracted.size -= 1;
435 	}
436 
437 //	printf("tld_extracted.size=%zd;tld_extracted.pos=%zd\n", tld_extracted.size, tld_extracted.pos);
438 
439 	return tld_extracted;
440 }
441 
442