1 /*
2 * DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
3 * Version 2, December 2004
4 *
5 * Copyright (C) 2012-2013 Sebastien Tricaud <sebastien@honeynet.org>
6 * Copyright (C) 2013 Cedric LE ROUX <cedric.lrx@gmail.com>
7 *
8 * Everyone is permitted to copy and distribute verbatim or modified
9 * copies of this license document, and changing it is allowed as long
10 * as the name is changed.
11 *
12 * DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
13 * TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
14 *
15 * 0. You just DO WHAT THE FUCK YOU WANT TO.
16 *
17 */
18
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22
23 #ifdef LINUX
24 #define __USE_BSD
25 #endif
26
27
28 #include <faup/faup.h>
29 #include <faup/tld.h>
30 #include <faup/tld-tree.h>
31
32 #include <faup/utarray.h>
33
34 static UT_array *_tlds;
35
faup_tld_tree_debug(TLDNode * tld_tree,int pos,char * type)36 void faup_tld_tree_debug(TLDNode *tld_tree, int pos, char *type)
37 {
38 printf("[%d] type: '%s'\n", pos, type);
39 printf("[%d] c : '%c'\n", pos, tld_tree->c);
40 printf("[%d] EoT : %d\n", pos, tld_tree->EoT);
41 if (tld_tree->sibling) {
42 faup_tld_tree_debug(tld_tree->sibling, pos+1, "sibling");
43 }
44
45 if (tld_tree->kid) {
46 faup_tld_tree_debug(tld_tree->kid, pos+1, "kid");
47 }
48 }
49
50
_faup_tld_tree_allocate_kid(TLDNode ** Node,char c,bool EoT,bool move_cursor)51 static int _faup_tld_tree_allocate_kid(TLDNode **Node, char c, bool EoT, bool move_cursor)
52 {
53 if( (*Node)->kid != NULL )
54 return -1;
55
56 // printf("kid\n");
57
58 (*Node)->kid = calloc(1, sizeof(TLDNode));
59 if( (*Node)->kid == NULL )
60 return -1;
61
62 ((TLDNode *)(*Node)->kid)->c = c;
63 ((TLDNode *)(*Node)->kid)->EoT = EoT;
64
65 if( move_cursor ) {
66 *Node = (TLDNode *)(*Node)->kid;
67 }
68 return 0;
69 }
70
_faup_tld_tree_allocate_sibling(TLDNode ** Node,char c,bool EoT,bool move_cursor)71 static int _faup_tld_tree_allocate_sibling(TLDNode **Node, char c, bool EoT, bool move_cursor)
72 {
73 if( (*Node)->sibling != NULL )
74 return -1;
75
76 // printf("sibling\n");
77
78 (*Node)->sibling = calloc(1, sizeof(TLDNode));
79 if( (*Node)->sibling== NULL ) {
80 return -1;
81 }
82
83 ((TLDNode *)(*Node)->sibling)->c = c;
84 ((TLDNode *)(*Node)->sibling)->EoT = EoT;
85
86 if( move_cursor ) {
87 *Node = (TLDNode *)(*Node)->sibling;
88 }
89 return 0;
90 }
91
92 /*
93 * Add a node the the Trie (should be kid or sibling of the root)
94 *
95 */
_faup_tld_tree_add_node(TLDNode ** Tree,char * tld,int tld_len)96 static int _faup_tld_tree_add_node(TLDNode **Tree, char *tld, int tld_len)
97 {
98 bool lastChar, nextIsDot, nextIsException;
99 char *p;
100 int ret;
101 TLDNode *pNode = *Tree;
102 int counter = 0;
103
104 //printf("Adding the TLD:[%s]\n", tld);
105
106 // Add the TLD to the Trie in reverse order
107 p = tld + tld_len -1;
108 while(counter < tld_len)
109 {
110 lastChar = (counter+1 == tld_len) ? true : false;
111 if (counter+1 < tld_len) {
112 nextIsDot = (*(p-1) == '.') ? true : false;
113 nextIsException = (*(p-1) == '!') ? true : false;
114 } else {
115 nextIsDot = false;
116 nextIsException = false;
117 }
118
119 // char does not exist in the Trie at that position
120 if( pNode->kid == NULL )
121 {
122 ret = _faup_tld_tree_allocate_kid(&pNode, *p, lastChar | nextIsDot | nextIsException, true);
123 if( ret == -1 ) {
124 return -1;
125 }
126 }
127 // char may exist in the Trie
128 else
129 {
130 pNode = pNode->kid;
131
132 while( (pNode->sibling != NULL) && (pNode->c != *p) ) {
133 pNode = pNode->sibling;
134 }
135
136 // char does not exist
137 if( pNode->c != *p )
138 {
139 ret = _faup_tld_tree_allocate_sibling(&pNode, *p, lastChar | nextIsDot | nextIsException, true);
140 if( ret == -1 ) {
141 return -1;
142 }
143 }
144 // char already exist at that position but not necessary as an ended one.
145 else if( lastChar || nextIsDot || nextIsException ) {
146 pNode->EoT = true;
147 }
148 }
149
150 counter++;
151 p--;
152 }
153
154 return 0;
155 }
156
157
158 /*
159 * Add a node to the Trie
160 * Define whether it's an exception (!<domain.tld>) or a regular TLD (including wildcards ones)
161 * Exception go under the Tree root's kid part, regular under the root's sibling.
162 */
faup_tld_tree_add_node(TLDNode ** Tree,char * TLD,int tld_len)163 static int faup_tld_tree_add_node(TLDNode **Tree, char *TLD, int tld_len)
164 {
165 TLDNode *pNode;
166
167 if (!Tree) {
168 fprintf(stderr, "%s Tree does not exists!\n", __FUNCTION__);
169 return -1;
170 }
171
172 // regular tld
173 if( *TLD != '!' )
174 {
175 // First node
176 if( (*Tree)->kid == NULL )
177 {
178 (*Tree)->kid = calloc(1, sizeof(TLDNode));
179 if( (*Tree)->kid == NULL ) {
180 return -1;
181 }
182
183 (*Tree)->kid->c = '\0';
184 }
185 pNode = (*Tree)->kid;
186 }
187 // exception
188 else
189 {
190 if( (*Tree)->sibling == NULL )
191 {
192 (*Tree)->sibling = calloc(1, sizeof(TLDNode));
193 if( (*Tree)->sibling == NULL ) {
194 return -1;
195 }
196
197 (*Tree)->sibling->c = '\0';
198 }
199 pNode = (*Tree)->sibling;
200 }
201
202 return _faup_tld_tree_add_node(&pNode, TLD, tld_len);
203 }
204
205
206
faup_tld_tree_add_tld(char * tld,void * user_data)207 static void faup_tld_tree_add_tld(char *tld, void *user_data)
208 {
209 int retval;
210
211 TLDNode *Tree = (TLDNode *)user_data;
212
213 retval = faup_tld_tree_add_node(&Tree, tld, strlen(tld));
214 if (retval) {
215 fprintf(stderr, "Error adding the tld '%s' to the tree\n", tld);
216 }
217
218 }
219
220 /* Return a TLD Tree (TLDNode *) or NULL on error
221 *
222 * Load TLD from a char array and transform it as a 'reversed' Tree (= Trie).
223 * Ex: '.com' is loaded as 'm->o->c->.'
224 *
225 * There is actually 2 Tree :
226 * Tree->siblings contains TLD and wildcards (ex: *.om)
227 * Tree->kids contains exceptions TLD (ex: !siemens.om)
228 *
229 * See :
230 * - http://en.wikipedia.org/wiki/Trie
231 * - http://en.wikipedia.org/wiki/Suffix_tree
232 */
233
faup_tld_tree_new(void)234 TLDNode *faup_tld_tree_new(void)
235 {
236 TLDNode *Tree;
237 int nbTLD, ret, i;
238
239 // Initialize the tree
240 Tree = calloc(1, sizeof(TLDNode));
241 if( Tree == NULL ) {
242 return NULL;
243 }
244 Tree->c = '\0';
245
246 ret = faup_tld_array_populate();
247 if (ret < 0) {
248 fprintf(stderr, "Error with faup_tld_array_populate from %s; Symptom: double initialization?\n", __FUNCTION__);
249 return NULL;
250 }
251 faup_tld_array_foreach(faup_tld_tree_add_tld, Tree);
252 faup_tld_array_destroy();
253
254 return Tree;
255 }
256
257
faup_tld_tree_free(TLDNode * tld_tree,TLDNode * last_tld_tree,int pos)258 void faup_tld_tree_free(TLDNode *tld_tree, TLDNode *last_tld_tree, int pos)
259 {
260 if (!tld_tree) return;
261
262 if (tld_tree->kid) {
263 faup_tld_tree_free(tld_tree->kid, tld_tree, pos+1);
264 }
265 if (tld_tree->sibling) {
266 faup_tld_tree_free(tld_tree->sibling, tld_tree, pos+1);
267 }
268
269 free(tld_tree);
270 }
271
272 /*
273 * Return TRUE if the provided tld is found in the provided Tree.
274 * or if the provided tld match a wildcard tld in the Tree.
275 * FALSE in any other case (no match)
276 *
277 */
faup_tld_tree_tld_exists(TLDNode * Tree,const char * tld,int tld_len)278 static bool faup_tld_tree_tld_exists(TLDNode *Tree, const char *tld, int tld_len)
279 {
280 TLDNode *pNode = Tree;
281 const char *p = NULL;
282 bool wildcard;
283 int lenght = 0;
284
285 if (!Tree) {
286 fprintf(stderr, "Error: Tree does not exists!\n");
287 return false;
288 }
289
290 p = tld + tld_len - 1;
291 while (tld_len-- > 0) {
292 wildcard = false;
293 pNode = pNode->kid;
294
295 while( pNode && (pNode->c != *p) ) {
296 if (pNode->c == '*') {
297 wildcard = true;
298 }
299 pNode = pNode->sibling;
300 }
301
302 if( ! pNode ) {
303 if( wildcard ) {
304 while( tld_len-- ) {
305 if( tld_len && (*(--p) == '.') ) {
306 return false;
307 }
308 }
309 return true;
310 }
311 return false;
312 }
313
314 p--;
315 }
316
317 if( pNode->EoT ) {
318 return true;
319 }
320
321 return false;
322 }
323
324 /*
325 * Return the starting position of the tld in host or -1 if not found
326 * Require a TLD Tree.
327 * Ex:
328 * - google.com => 7 (='com')
329 * - abc.co.uk => 4 (='co.uk')
330 * - google.nawak => -1 (NOT FOUND)
331 *
332 */
faup_tld_tree_extract(faup_handler_t * fh,TLDNode * tld_tree)333 faup_tld_tree_extracted_t faup_tld_tree_extract(faup_handler_t *fh, TLDNode *tld_tree)
334 {
335 const char *p;
336 int32_t host_last_pos;
337 char *last;
338 bool found;
339 uint32_t step = 0;
340 uint32_t tld_len = 0;
341 uint32_t tld_exception_len = 0;
342 uint32_t last_len_just_for_host = 0;
343 size_t last_len;
344 size_t org_str_len;
345
346 faup_tld_tree_extracted_t tld_extracted;
347
348 uint32_t counter;
349 bool has_a_dot = false;
350
351 tld_extracted.pos = -1;
352 tld_extracted.size = 0;
353
354 if (!tld_tree) {
355 fprintf(stderr, "(Error) No TLD Tree given!\n");
356 return tld_extracted;
357 }
358
359 if (!tld_tree->kid) {
360 fprintf(stderr, "(Warning) Cannot extract TLD > 1. Mozilla list does not exists. Please download it (faup -u)\n");
361 return tld_extracted;
362 }
363
364 last = NULL;
365
366 host_last_pos = fh->faup.features.host.pos + fh->faup.features.host.size;
367
368 p = fh->faup.org_str + host_last_pos - 1;
369
370 counter = fh->faup.features.host.size - 1;
371 while( counter )
372 {
373 while( *(p-1) && (*p != '.') ) {
374 p--;
375 counter--;
376 if (counter <= 0) break;
377 }
378
379 step = ( *p == '.' ) ? 1 : 0;
380
381 found = faup_tld_tree_tld_exists(tld_tree->kid, p + step, fh->faup.features.host.size - counter - 1);
382 if( ! found ) {
383 break;
384 } else {
385 tld_len = fh->faup.features.host.size - counter - 1;
386 }
387
388 last = (char *) p + step;
389
390 p--;
391 counter--;
392 }
393
394 if( last == NULL ) {
395 return tld_extracted;
396 }
397
398
399 // We want to retrieve the size of the tld without the useless chars the come afterwards
400 // www.foo.siemens.om/tagada != www.foo.siemens.om
401 last_len = counter;
402 last_len_just_for_host = last_len - (fh->faup.org_str_len - (fh->faup.features.host.pos + fh->faup.features.host.size));
403
404 counter = 0;
405 if (tld_tree->sibling) {
406 found = faup_tld_tree_tld_exists(tld_tree->sibling, last, last_len_just_for_host);
407 if( found )
408 {
409 // here we have the longest TLD
410 // but is that an exception ? (ex: !siemens.om vs *.om)
411 while (counter < tld_len) {
412 if (*last != '.') {
413 last++;
414 tld_exception_len++;
415 } else {
416 has_a_dot = true;
417 break;
418 }
419 counter++;
420 }
421 }
422 }
423
424 tld_extracted.size = tld_len - tld_exception_len;
425 if (!tld_extracted.size) {
426 tld_extracted.size = tld_len;
427 }
428
429 //printf("fh->faup.features.host.pos(%zd), fh->faup.features.host.size(%zd), tld_extracted.size(%zd), counter(%zd)\n", fh->faup.features.host.pos, fh->faup.features.host.size, tld_extracted.size, counter);
430 tld_extracted.pos = fh->faup.features.host.pos + fh->faup.features.host.size - tld_extracted.size;
431
432 if (has_a_dot) {
433 tld_extracted.pos += 1;
434 tld_extracted.size -= 1;
435 }
436
437 // printf("tld_extracted.size=%zd;tld_extracted.pos=%zd\n", tld_extracted.size, tld_extracted.pos);
438
439 return tld_extracted;
440 }
441
442