1 /*
2 * genTable.c - URLfilterDB
3 *
4 * ufdbGuard is copyrighted (C) 2005-2020 by URLfilterDB B.V. with all rights reserved.
5 *
6 * Parts of the ufdbGuard daemon are based on squidGuard.
7 * This module is NOT based on squidGuard.
8 *
9 * Generate a binary table file (.ufdb) from unordered ASCII files
10 * with domains and urls.
11 *
12 * usage: ufdbGenTable [-F 2.0|2.1|2.2|2.3|3.0] [-V] [-n] [-C | -Z] [-X] [-k <key>] -t <tableName> -d <domains> [-u <urls>]
13 *
14 * RCS $Id: genTable.c,v 1.164 2020/10/22 07:12:36 root Exp root $
15 */
16
/* Compile-time switch: a non-zero value turns the DEBUG(( ... )) trace macro of this file into fprintf calls. */
#define UFDB_DO_DEBUG 0

/* ufdbGenTable needs speed! */
#undef _FORTIFY_SOURCE

/* Ask GCC for -O3 on this file when the build requests it, debugging is off and GCC is 4.4 or newer. */
#if UFDB_OVERRIDE_GCC_OPT && !UFDB_DO_DEBUG && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4))
#pragma GCC optimize ("O3")
#endif

/* The trailing "&& 0" makes this condition always false, so __USE_STRING_INLINES is never defined here. */
#if !UFDB_DO_DEBUG && defined(__OPTIMIZE__) && 0
#define __USE_STRING_INLINES 1
#endif
29
30
31 #include "ufdb.h"
32 #include "ufdblib.h"
33 #include "ufdbdb.h"
34 #include "ufdblocks.h"
35 #if UFDBSS_RESTAPI
36 #include "ufdbstrlib.h"
37 #endif
38
39 #include <stdio.h>
40 #include <stdarg.h>
41 #include <stdlib.h>
42 #include <string.h>
43 #include <strings.h>
44 #include <ctype.h>
45 #include <limits.h>
46 #include <time.h>
47 #include <errno.h>
48 #include <sys/types.h>
49 #include <unistd.h>
50
51 #if UFDB_BZ2LIB_SUPPORT
52 #include "bzlib.h"
53 #endif
54 #include "zlib.h"
55
56 #ifdef __cplusplus
57 extern "C" {
58 #endif
59
/* strmatchN(a,b): non-zero when the first N characters of a equal the first N characters
 * of b and a[N] is the terminating NUL.  The terminator of b is never inspected, so b is
 * meant to be a string of exactly N characters.  Both arguments are evaluated several
 * times and therefore must not have side effects.
 */
#define strmatch2(a,b) ((a)[0] == (b)[0] && (a)[1] == (b)[1] && (a)[2] == '\0')
#define strmatch3(a,b) ((a)[0] == (b)[0] && (a)[1] == (b)[1] && (a)[2] == (b)[2] && (a)[3] == '\0')
#define strmatch4(a,b) ((a)[0] == (b)[0] && (a)[1] == (b)[1] && (a)[2] == (b)[2] && (a)[3] == (b)[3] && (a)[4] == '\0')
#define strmatch5(a,b) ((a)[0] == (b)[0] && (a)[1] == (b)[1] && (a)[2] == (b)[2] && (a)[3] == (b)[3] && (a)[4] == (b)[4] && (a)[5] == '\0')
#define strmatch6(a,b) ((a)[0] == (b)[0] && (a)[1] == (b)[1] && (a)[2] == (b)[2] && (a)[3] == (b)[3] && (a)[4] == (b)[4] && (a)[5] == (b)[5] && (a)[6] == '\0')
#define strmatch7(a,b) ((a)[0] == (b)[0] && (a)[1] == (b)[1] && (a)[2] == (b)[2] && (a)[3] == (b)[3] && (a)[4] == (b)[4] && (a)[5] == (b)[5] && (a)[6] == (b)[6] && (a)[7] == '\0')
#define strmatch8(a,b) ((a)[0] == (b)[0] && (a)[1] == (b)[1] && (a)[2] == (b)[2] && (a)[3] == (b)[3] && (a)[4] == (b)[4] && (a)[5] == (b)[5] && (a)[6] == (b)[6] && (a)[7] == (b)[7] && (a)[8] == '\0')
#define strmatch9(a,b) ((a)[0] == (b)[0] && (a)[1] == (b)[1] && (a)[2] == (b)[2] && (a)[3] == (b)[3] && (a)[4] == (b)[4] && (a)[5] == (b)[5] && (a)[6] == (b)[6] && (a)[7] == (b)[7] && (a)[8] == (b)[8] && (a)[9] == '\0')
#define strmatch10(a,b) ((a)[0] == (b)[0] && (a)[1] == (b)[1] && (a)[2] == (b)[2] && (a)[3] == (b)[3] && (a)[4] == (b)[4] && (a)[5] == (b)[5] && (a)[6] == (b)[6] && (a)[7] == (b)[7] && (a)[8] == (b)[8] && (a)[9] == (b)[9] && (a)[10] == '\0')
#define strmatch11(a,b) ((a)[0] == (b)[0] && (a)[1] == (b)[1] && (a)[2] == (b)[2] && (a)[3] == (b)[3] && (a)[4] == (b)[4] && (a)[5] == (b)[5] && (a)[6] == (b)[6] && (a)[7] == (b)[7] && (a)[8] == (b)[8] && (a)[9] == (b)[9] && (a)[10] == (b)[10] && (a)[11] == '\0')
#define strmatch12(a,b) ((a)[0] == (b)[0] && (a)[1] == (b)[1] && (a)[2] == (b)[2] && (a)[3] == (b)[3] && (a)[4] == (b)[4] && (a)[5] == (b)[5] && (a)[6] == (b)[6] && (a)[7] == (b)[7] && (a)[8] == (b)[8] && (a)[9] == (b)[9] && (a)[10] == (b)[10] && (a)[11] == (b)[11] && (a)[12] == '\0')
#define strmatch13(a,b) ((a)[0] == (b)[0] && (a)[1] == (b)[1] && (a)[2] == (b)[2] && (a)[3] == (b)[3] && (a)[4] == (b)[4] && (a)[5] == (b)[5] && (a)[6] == (b)[6] && (a)[7] == (b)[7] && (a)[8] == (b)[8] && (a)[9] == (b)[9] && (a)[10] == (b)[10] && (a)[11] == (b)[11] && (a)[12] == (b)[12] && (a)[13] == '\0')
72
73
74 /* This is an attempt to define a new algorithm for generating a table.
75 * The reason for the need of a new algorithm is performance.
76 * In December 2013 the adult category has 3076456 domains, 276707 urls,
77 * 3353163 leaf nodes and 207536 index nodes:
78 * #sub1: 156411 #sub2: 23879 #sub3: 11226 #sub4: 4923
79 * #sub5: 2275 #sub6: 1269 #sub7: 847
80 * #sub8+: 6443 #sub255+: 260 #sub64K+: 3
81 *
 * On an Intel E5-2420 processor ufdbGenTable takes 32 seconds to generate a .ufdb table file
83 * where 3 seconds are needed for composing an in-memory table of the 3 million domains
84 * and 29 seconds are needed to insert the 275000 urls.
85 * The total time to generate is too much and will only grow when the adult table grows.
86 * The time to insert the relatively small amount of urls is simply too much and
87 * caused by inserting elements in an array with 500000+ elements. The array shift
88 * is taking 90+ per cent of all CPU time.
89 * The .com TLD has 539320 child nodes and the tumblr.com domain has 691892 child nodes.
90 * Note that the L3 cache of the E5-2420 is 15 MB and that 700000 * 32 bytes (array element size)
91 * is 21.3 MB and hence significantly larger than the L3 cache.
92 *
93 * The input files are 99% sorted so processing them uses a lot of fast tail insertions
94 * except for IP addresses. Therefore, processing the domains file is fast.
95 * The processing of the urls file is slow since it causes inserts at random points
96 * in the table and therefore causes a lot of array shifts.
97 *
98 * The new algorithm must be much faster: the target is to generate the adult table
99 * in under 5 seconds.
100 *
101 * The new data structure is based upon the old one with the difference that
102 * the large array will be divided in a list of small arrays which are fast to manipulate.
103 * The list of arrays is a kind of a btree. Each array has a variable size with a maximum of N.
 * N is chosen to be 2000 since 2000*32 = 64000 and is sufficiently small to not
105 * cause much stress for the L2 cache (the L2 cache size is often 256K).
106 * The array size must be small to make sure that an insert in the array is fast so
107 * maybe N must be reduced to 1000 or less. Experiments with different values were done
108 * and showed a slight performance degradation with smaller and higher values.
109 * Each array is filled and when it becomes full, it is split into 2 arrays where
110 * the two new arrays each have 50% of the elements of the split arrays, EXCEPT
111 * when the array is the last array of the list. Since there is a lot of tail insertion,
112 * the last array of the list, when it becomes full, is split into one array with N-1
113 * elements and the new last array will have 1 element.
114 *
 * With N=2000, a list of arrays with a total of 700000 elements has on average
 * 700 arrays.
117 *
118 * The data structure used by ufdbGenTable is for ufdbGenTable only and is not used by
119 * the database load functions of ufdbguardd.
120 *
121 * The results are good:
122 * ufdbGentable version time (s)
123 * 1.31 32.5
124 * 1.32 new algorithm 3.0
125 */
126
/* Lower and upper bound for the number of nodes in one Subnodes Table Array (STA). */
#define MinNodesPerSTA 1
#define MaxNodesPerSTA 2000
129
130 struct UFDBgentableNode;
131 struct UFDBgentableSTA;
132
133
/*
 * One node of the in-memory table: a tag plus the list of STAs that hold its sub nodes.
 * The original note gives the size as 32 bytes.
 * NOTE(review): the four members shown add up to 24 bytes with 8-byte pointers - confirm.
 */
struct UFDBgentableNode                      /* 32 bytes */
{
   unsigned int              totalSubNodes;  /* sum(stas.nSubNodes) */
   unsigned int              nSTA;           /* number of STAs */
   unsigned char *           tag;            /* tag */
   struct UFDBgentableSTA *  stas;           /* Subnodes Table Arrays (STAs) */
};
141
/*
 * One Subnodes Table Array (STA): a contiguous array of nodes of which the
 * first nSubNodes entries are in use.
 */
struct UFDBgentableSTA                       /* 16 bytes */
{
   unsigned int              nSubNodes;      /* #used nodes; n<nodeArraySize */
   unsigned int              nodeArraySize;  /* 0, 1, 8, 256 or 2000 */
   struct UFDBgentableNode * subNodes;       /* not a pointer but an array! */
};
148
/* File-scope state of the generator.
 * NOTE(review): the code that assigns these is outside the part of the file reviewed here;
 * the per-variable notes are inferred from the names and the usage() text - confirm.
 */
static FILE * fin;                 /* presumably the input file that is currently being read */
static char * domainsFileName;     /* presumably the argument of -d */
static char * urlsFileName;        /* presumably the argument of -u */
static char * tableName;           /* presumably the argument of -t */
static char endian = 'L';          /* defaults to 'L' */

static struct UFDBgentableNode * table = NULL;   /* top node of the in-memory table; NULL until created */
156
157 #if 0
158 int ufdbGV.debug = 0;
159 #endif
160
/* Statistics counters and run-time option flags, shown here with their defaults.
 * NOTE(review): the code that updates and reports them is outside the part of the file
 * reviewed here; the option letters mentioned below come from the usage() text - confirm.
 */
static int printStatistics = 0;    /* presumably set by -S */
static int utf8support = 1;        /* presumably cleared by -N */

/* counters, all starting at zero */
static long numEntries = 0;
static long numNodes = 0;
static long numLeafNodes = 0;
static long numSub1 = 0;
static long numSub2 = 0;
static long numSub3 = 0;
static long numSub4 = 0;
static long numSub5 = 0;
static long numSub6 = 0;
static long numSub7 = 0;
static long numSub8 = 0;
static long numSub8plus = 0;
static long numSub255plus = 0;
static long numSub64Kplus = 0;
static long numIndexNodes = 0;
#if UFDB_DBFORMAT_3
/* additional counters that only exist when format 3 support is compiled in */
static long numLabelNodes = 0;
static long numChnksStat = 0;
static long numOverflow = 0;
static long numFewLabels = 0;
static long numVeryFewPaths = 0;
static long numFewPaths = 0;
static long num250Paths = 0;
static long numManyPaths = 0;
#endif
static int numWarnings = 0;

/* option flags */
static int doCrypt = 1;            /* encryption is on by default; init_membuf() tests this flag */
static int doBZ2compress = 0;
static int doZLIBcompress = 0;
static int doProd = 0;             /* when set, randomisebuf64() mixes the output of "who" into its seed */
static int doPadding = 0;
static int doWarnings = 1;
static int doSanityCheck = 1;
static int urlsIncluded = 1;
static int stripWWW = 0;
static int skipOptimisations = 0;
static int URLparamSupport = 0;
static char * format = UFDBdefaultdbVersion;   /* table format version string; compared with "2.1" in init_membuf() */
201
/* DEBUG(( stream, fmt, ... )) - note the double parentheses - becomes an fprintf call
 * when UFDB_DO_DEBUG is non-zero and expands to nothing otherwise.
 */
#if UFDB_DO_DEBUG || 0
#define DEBUG(x) fprintf x
#else
#define DEBUG(x)
#endif
207
/* ROUNDUP(i) yields the next multiple of ROUNDUPBY that is strictly greater than i:
 * a value that is already a multiple is still increased by a full step
 * (ROUNDUP(16) is 32).  The argument is evaluated twice.
 */
#define ROUNDUPBY 16
#define ROUNDUP(i) ( (i) + (ROUNDUPBY - ((i)%ROUNDUPBY) ) )

/* Same as ROUNDUP but with a step of BIGROUNDUPBY. */
#define BIGROUNDUPBY 128
#define BIGROUNDUP(i) ( (i) + (BIGROUNDUPBY - ((i)%BIGROUNDUPBY) ) )

/* NOTE(review): presumably the size from which BIGROUNDUP is used instead of ROUNDUP - confirm at the point of use. */
#define ROUNDUPBYCUTOFF BIGROUNDUPBY
215
216 #include "strcmpurlpart.static.c"
217
218
219
220 #if HAVE_PUTC_UNLOCKED
221
222 #define myfast_putc(c,fp) putc_unlocked(c,fp)
223 #define myfast_getc(fp) getc_unlocked(fp)
224
225 #if defined(__linux__) && defined(_GNU_SOURCE)
226 #define fast_puts(s,fp) fputs_unlocked(s,fp)
227 #else
fast_puts(const char * s,FILE * fp)228 UFDB_GCC_INLINE int fast_puts( const char * s, FILE * fp )
229 {
230 int retval;
231
232 retval = 1;
233 while (*s != '\0' && ((retval = myfast_putc(*s,fp)) != EOF))
234 s++;
235
236 return retval;
237 }
238 #endif
239
240 #else
241
242 #define myfast_putc(c,fp) fputc(c,fp)
243 #define fast_puts(s,fp) fputs(s,fp)
244
245 #define myfast_getc(fp) fgetc(fp)
246 #endif
247
248 #define MAXLOGMSGSIZE (60*1024)
249
250
usage(void)251 static void usage( void )
252 {
253 fprintf( stderr, "usage: %s [-nNqV] [-C | -Z] [-k <key>] -t <tableName> -d <domains> [-u <urls>]\n",
254 ufdbGV.progname );
255 fprintf( stderr, "flags: -n no encryption\n" );
256 fprintf( stderr, " -k 16-char encryption key\n" );
257 #if UFDB_DBFORMAT_3
258 fprintf( stderr, " -F 2.2|3.0 (default is %s)\n", UFDBdefaultdbVersion );
259 // fprintf( stderr, " -B | -L generate big endian or little endian file format (default -L)\n" );
260 #else
261 fprintf( stderr, " -F 2.0|2.1|2.2 (default is %s)\n", UFDBdefaultdbVersion );
262 #endif
263 fprintf( stderr, " -t tablename\n" );
264 fprintf( stderr, " -d domains\n" );
265 fprintf( stderr, " -u urls\n" );
266 fprintf( stderr, " -U domains and URLs are in the file specified by -d\n" );
267 #if UFDB_BZ2LIB_SUPPORT
268 fprintf( stderr, " -C use bzip2 compression (relatively slow)\n" );
269 #endif
270 fprintf( stderr, " -D debug\n" );
271 fprintf( stderr, " -N NO UTF8 support (skip URLs with UTF8 characters)\n" );
272 fprintf( stderr, " -q be quiet (suppress warnings)\n" );
273 fprintf( stderr, " -s sanity check for domain names (obsolete option - check is always done)\n" );
274 fprintf( stderr, " -S print table statistics\n" );
275 fprintf( stderr, " -V print version (" UFDB_VERSION ") and exit\n" );
276 fprintf( stderr, " -W strip \"www.\" from URLs\n" );
277 fprintf( stderr, " -X skip table optimisations - only for expert users\n" );
278 fprintf( stderr, " -Z use zlib compression - up to 5 times faster than bzip2 but slightly less compression\n" );
279 exit( 1 );
280 }
281
282
randomChar(void)283 UFDB_GCC_INLINE static unsigned char randomChar( void )
284 {
285 static unsigned char * a = (unsigned char *) "1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
286 return a[random() % 62];
287 }
288
289
randomisebuf64(unsigned char * buf)290 static void randomisebuf64( unsigned char * buf )
291 {
292 int n;
293 unsigned int seed;
294 FILE * fp;
295
296 seed = 0x05ac7326;
297
298 if (doProd)
299 {
300 fp = popen( "who", "r" );
301 if (fp != NULL)
302 {
303 int ch;
304 while ((ch = fgetc(fp)) != EOF)
305 {
306 seed = (seed >> 11) + 23 + 7 * (seed ^ ch);
307 }
308 fclose( fp );
309 }
310 }
311
312 #if 0
313 printf( "random seed is %08x\n", seed );
314 #endif
315
316 seed = (seed + 1309) ^ (((getpid() << 3) ^ (time(NULL) << 19)) + (getppid() << 26));
317 srandom( seed );
318
319 for (n = 0; n < 64; n++)
320 {
321 *buf++ = randomChar();
322 }
323 }
324
325
ufdbLogError(const char * format,...)326 void ufdbLogError( const char * format, ... )
327 {
328 va_list ap;
329 char msg[MAXLOGMSGSIZE];
330
331 va_start( ap, format );
332 if (vsnprintf(msg, MAXLOGMSGSIZE-1, format, ap) > (MAXLOGMSGSIZE - 2))
333 msg[MAXLOGMSGSIZE-1] = '\0';
334 va_end( ap );
335
336 fprintf( stderr, "ERROR: %s\n", msg );
337 }
338
339
ufdbLogMessage(const char * format,...)340 void ufdbLogMessage( const char * format, ... )
341 {
342 va_list ap;
343 char msg[MAXLOGMSGSIZE];
344
345 va_start( ap, format );
346 if (vsnprintf(msg, MAXLOGMSGSIZE-8, format, ap) >= (MAXLOGMSGSIZE - 8))
347 msg[MAXLOGMSGSIZE-8] = '\0';
348 va_end( ap );
349
350 fprintf( stderr, "%s\n", msg );
351 }
352
353
ufdbLogFatalError(const char * format,...)354 void ufdbLogFatalError( const char * format, ... )
355 {
356 va_list ap;
357 char msg[MAXLOGMSGSIZE];
358
359 va_start( ap, format );
360 if (vsnprintf(msg, MAXLOGMSGSIZE-1, format, ap) > (MAXLOGMSGSIZE - 2))
361 msg[MAXLOGMSGSIZE-1] = '\0';
362 va_end( ap );
363
364 fprintf( stderr, "FATAL ERROR: %s *****\n", msg );
365 }
366
367
/*
 * Stub: this implementation accepts its three arguments and does nothing with
 * them.  The log functions defined in this file always write to stderr.
 */
void ufdbSetGlobalErrorLogFile(
   char *  logdir         __attribute__((unused)),
   char *  basename       __attribute__((unused)),
   int     mutex_is_used  __attribute__((unused)) )
{
   /* reference each parameter once to keep compilers quiet about unused parameters */
   (void) logdir;
   (void) basename;
   (void) mutex_is_used;
}
377
378
/*
 * Skip a leading "<scheme>://" of URL.
 * The scheme is everything up to the first of the characters . : / ? @ # %
 * Returns a pointer to the character after "://" when that first special
 * character starts the sequence "://", and NULL otherwise.
 */
inline static unsigned char * skipProtocol( unsigned char * URL )
{
   unsigned char * sep;

   sep = URL + strcspn( (char*) URL, ".:/?@#%" );

   /* the tests short-circuit, so nothing beyond the terminating NUL is read */
   if (sep[0] != ':'  ||  sep[1] != '/'  ||  sep[2] != '/')
      return NULL;

   return sep + 3;
}
388
389
/*
 * Placeholder for the comparison of URL parameters: the arguments are ignored
 * and the parameters are simply assumed to be always different.
 * Always returns 0.
 */
inline static int gentableVerifyParameters(
   struct UFDBgentableNode * node,
   UFDBrevURL *              revUrl )
{
   /* reference the parameters once to prevent compiler warnings */
   (void) node;
   (void) revUrl;

   return 0;
}
401
402
/* perform lookup of revUrl in the table pointed by its top node.
 * return 1 iff found, 0 otherwise.
 *
 * node   : node whose sub nodes are searched for the first part of revUrl.
 * revUrl : list of URL parts; each part is compared with strcmpURLpart() against
 *          the tags of one table level and ->next gives the part for the next level.
 *
 * The walk descends one table level per URL part:
 * - a matching node without sub nodes ends the search with a match, also when
 *   the URL has more parts;
 * - a matching part that starts with '/' on a node that has sub nodes yields
 *   the result of gentableVerifyParameters();
 * - running out of URL parts while the table has more levels is not a match.
 */
UFDB_GCC_HOT
static int UFDBgentableLookupRevUrl(
   struct UFDBgentableNode * node,
   UFDBrevURL *              revUrl )
{
   int b, e;      /* lower and upper bound of the current binary search */
   int i;         /* index that was probed last */
   int cmp;       /* result of the last strcmpURLpart() call */

   struct UFDBgentableSTA * sta;
#if UFDB_DO_DEBUG
   struct UFDBgentableNode * origtable = node;   /* only referenced by DEBUG output */
#endif

   /* there is a 2-level search (both binary search):
    * first, search the STA and second search within the STA.
    */

begin:      /* re-entered for every next level of the table */
   DEBUG(( stderr, " UFDBgentableLookupRevUrl: table %-14s [%d] tag %14s : %s\n",
           origtable->tag, origtable->nSTA, node->tag, revUrl->part ));

   if (node->nSTA == 0)
      return 0;

   i = 0;
   e = node->nSTA - 1;
   if (e == 0)
   {
      /* a single STA: there is nothing to select */
      sta = &node->stas[0];
      DEBUG(( stderr, " use STA 0 (nnodes is 1)\n" ));
   }
   else
   {
      sta = NULL; /* suppress compiler warning */
      b = 0;
      /* level 1: binary search over the STAs, comparing with the first tag of each STA */
      while (b <= e)
      {
         i = (b + e) / 2;
         sta = &node->stas[i];
         cmp = strcmpURLpart( (char *) revUrl->part, (char *) sta->subNodes[0].tag );
         DEBUG(( stderr, " last node tag in STA %-3d (nnodes is %d) %s \n",
                 i, sta->nSubNodes, sta->subNodes[sta->nSubNodes-1].tag ));
         DEBUG(( stderr, " first node tag in STA %-3d strcmpURLpart( %s, %s ) is %d\n",
                 i, (char *) revUrl->part, (char *) sta->subNodes[0].tag, cmp ));
         if (cmp < 0)
            e = i - 1;
         else if (cmp > 0)
            b = i + 1;
         else
            break;
      }
      /* the above comparisons were done against sta[i].subnodes[0].tag and
       * we may be one STA too far so compare the tag with sta[-1].subnodes[last].tag
       */
      if (i > 0)
      {
         struct UFDBgentableSTA * prevsta;
         prevsta = &node->stas[i-1];
         cmp = strcmpURLpart( (char *) revUrl->part, (char *) prevsta->subNodes[prevsta->nSubNodes-1].tag );
         if (cmp <= 0)
         {
            /* the part is not beyond the last tag of the previous STA: search that one */
            i = i - 1;
            sta = prevsta;
            DEBUG(( stderr, " tag is in previous STA (%d)", i ));
         }
      }
   }
   DEBUG(( stderr, " UFDBgentableLookupRevUrl STA %d: nSubNodes %d\n", i, sta->nSubNodes ));

   // the sta has been found. Now search inside the sta
   b = 0;
   e = sta->nSubNodes - 1;
   while (b <= e)
   {
      i = (b + e) / 2;
      cmp = strcmpURLpart( (char *) revUrl->part, (char *) sta->subNodes[i].tag );
      DEBUG(( stderr, " i:%-3d strcmpURLpart( %s, %s ) is %d\n",
              i, (char *) revUrl->part, (char *) sta->subNodes[i].tag, cmp ));
      if (cmp < 0)
         e = i - 1;
      else if (cmp > 0)
         b = i + 1;
      else
      {
         /* the part was found at this level */
         int is_path = (revUrl->part[0] == '/');

         node = &(sta->subNodes[i]);
         if (node->totalSubNodes == 0) /* no more levels in table -> MATCH */
         {
            DEBUG(( stderr, " no more levels in table -> MATCH\n" ));
            return 1;
         }

         /* totalSubNodes is known to be non-zero here, so only is_path decides */
         if (is_path && node->totalSubNodes > 0)
         {
            DEBUG(( stderr, " is_path: gentableVerifyParameters()\n" ));
            return gentableVerifyParameters( node, revUrl->next );
         }

         revUrl = revUrl->next;
         if (revUrl == NULL) /* no more levels in URL -> NO match */
         {
            DEBUG(( stderr, " no more levels in URL -> NO match\n" ));
            return 0;
         }

         /* continue with the next URL part one level deeper in the table */
         goto begin;
      }
   }

   DEBUG(( stderr, " not found\n" ));
   return 0; /* not found */
}
520
521
/* Growable byte buffer in which the table image is composed by the mem_* functions. */
static unsigned char * mem = NULL;      /* start of the buffer; allocated by init_membuf() */
static unsigned long mem_i = 0;         /* number of bytes in use, i.e. the offset of the next write */
static unsigned long mem_size = 0;      /* number of bytes allocated */
525
init_membuf(void)526 static void init_membuf( void )
527 {
528 mem_size = 2 * 1024 * 1024;
529 mem = (unsigned char *) malloc( mem_size );
530
531 /* starting with format 2.1 there are 64 random bytes inserted at the beginning
532 * of the crypted table as a defense against brute force decryption attempts.
533 */
534 if (doCrypt && strcmp( format, "2.1") >= 0)
535 {
536 randomisebuf64( mem );
537 mem_i += 64;
538 #if UFDB_DO_DEBUG
539 if (ufdbGV.debug)
540 fprintf( stderr, "inserted 64 random bytes at the start of the URL table\n" );
541 #endif
542 }
543 }
544
545
546 UFDB_GCC_HOT
mem_putc(unsigned char c)547 static void mem_putc( unsigned char c )
548 {
549 if (mem_i == mem_size)
550 {
551 mem_size += 2 * 1024 * 1024;
552 if (mem_size > 4 * UINT_MAX)
553 {
554 ufdbLogFatalError( "mem_putc: table is too large!" );
555 exit( 1 );
556 }
557 mem = (unsigned char *) realloc( mem, mem_size );
558 if (mem == NULL)
559 {
560 fprintf( stderr, "cannot allocate %lu bytes for the in-memory table.\n", mem_size );
561 exit( 1 );
562 }
563 }
564 mem[mem_i++] = c;
565 }
566
567
568 UFDB_GCC_HOT
mem_pad(void)569 static void mem_pad( void )
570 {
571 if (mem_i + 8 >= mem_size)
572 {
573 mem_size += 2 * 1024 * 1024;
574 mem = (unsigned char *) realloc( mem, mem_size );
575 if (mem == NULL)
576 {
577 fprintf( stderr, "cannot allocate %lu bytes for the in-memory table.\n", mem_size );
578 exit( 1 );
579 }
580 }
581
582 while (mem_i % 8 > 0)
583 mem[mem_i++] = UFDBpadTable;
584 }
585
586
587 UFDB_GCC_HOT UFDB_GCC_INLINE
mem_puts(unsigned char * s)588 static void mem_puts( unsigned char * s )
589 {
590 if (mem_i + sizeof(UFDBurlPart) >= mem_size)
591 {
592 mem_size += 2 * 1024 * 1024;
593 if (mem_size > 4 * UINT_MAX)
594 {
595 ufdbLogFatalError( "mem_puts: table is too large!" );
596 exit( 1 );
597 }
598 mem = (unsigned char *) realloc( mem, mem_size );
599 if (mem == NULL)
600 {
601 fprintf( stderr, "cannot allocate %lu bytes for the in-memory table.\n", mem_size );
602 exit( 1 );
603 }
604 }
605
606 while (*s != '\0')
607 mem[mem_i++] = *s++;
608 }
609
610
611 UFDB_GCC_HOT UFDB_GCC_INLINE
mem_putsnt(unsigned char * s)612 static void mem_putsnt( unsigned char * s )
613 {
614 if (mem_i + sizeof(UFDBurlPart) >= mem_size)
615 {
616 mem_size += 2 * 1024 * 1024;
617 if (mem_size > 4 * UINT_MAX)
618 {
619 ufdbLogFatalError( "mem_putsnt: table is too large!" );
620 exit( 1 );
621 }
622 mem = (unsigned char *) realloc( mem, mem_size );
623 if (mem == NULL)
624 {
625 fprintf( stderr, "cannot allocate %lu bytes for the in-memory table.\n", mem_size );
626 exit( 1 );
627 }
628 }
629
630 while (*s != '\0')
631 mem[mem_i++] = *s++;
632 mem[mem_i++] = '\0';
633 }
634
635
636 #if UFDB_DBFORMAT_3
637 UFDB_GCC_HOT
mem_alloc4(unsigned long nbytes)638 static uint64_t mem_alloc4( unsigned long nbytes ) // returns a byte offset
639 {
640 unsigned long new_mem_i = 0;
641 new_mem_i = (mem_i + 3) & ~3; // round up to multiple of 4
642 if (new_mem_i != mem_i)
643 {
644 while (mem_i != new_mem_i)
645 mem_putc( '\0' );
646 }
647 uint64_t m = mem_i;
648 if (ufdbGV.debug)
649 ufdbLogMessage( " mem_alloc4 %5lu bytes byte-offset %-6lu table-offset %lu", nbytes, m, m>>2 );
650
651 if (mem_i + nbytes >= mem_size)
652 {
653 if (nbytes >= 2 * 1024 * 1024)
654 mem_size += (nbytes + 2 * 1024 * 1024);
655 else
656 mem_size += 2 * 1024 * 1024;
657 if (mem_size > 4 * UINT_MAX)
658 {
659 ufdbLogFatalError( "mem_alloc4: table is too large!" );
660 exit( 1 );
661 }
662 mem = realloc( mem, mem_size );
663 if (mem == NULL)
664 {
665 fprintf( stderr, "mem_alloc4: cannot allocate %lu bytes for the in-memory table.\n", mem_size );
666 exit( 1 );
667 }
668 if (ufdbGV.debug)
669 ufdbLogMessage( "mem_alloc4: realloc to size %'ld", mem_size );
670 }
671
672 mem_i += nbytes;
673 return m;
674 }
675 #endif
676
677
678 #if 0
/*
 * Compiled out by the enclosing "#if 0".
 * Reserve nbytes bytes in the in-memory table, starting at a 16-byte aligned offset:
 * the write offset is padded with '\0' bytes up to a multiple of 16, the buffer is
 * enlarged when the request does not fit, and the byte offset of the reserved
 * region is returned.
 */
UFDB_GCC_HOT
static uint64_t mem_alloc16( unsigned long nbytes ) // returns a byte offset
{
   unsigned long new_mem_i = 0;
   new_mem_i = (mem_i + 15) & ~15; // round up to multiple of 16
   if (new_mem_i != mem_i)
   {
      while (mem_i != new_mem_i)
         mem_putc( '\0' );
   }
   uint64_t m = mem_i;
   if (ufdbGV.debug)
      ufdbLogMessage( " mem_alloc16 %4lu bytes byte-offset %-6lu table-offset %lu", nbytes, m, m>>2 );

   if (mem_i + nbytes >= mem_size)
   {
      // caller may request 600 KB and adding 256 KB won't do!!
      if (nbytes >= 256 * 1024)
         mem_size += (nbytes + 256 * 1024);
      else if (mem_size < 1024 * 1024)
         mem_size += 256 * 1024;       /* a small buffer grows in small steps */
      else
         mem_size += 2 * 1024 * 1024;
      if (mem_size > 4 * UINT_MAX)
      {
         ufdbLogFatalError( "mem_alloc16: table is too large!" );
         exit( 1 );
      }
      mem = realloc( mem, mem_size );
      if (mem == NULL)
      {
         fprintf( stderr, "mem_alloc16: cannot allocate %lu bytes for the in-memory table.\n", mem_size );
         exit( 1 );
      }
   }

   mem_i += nbytes;
   return m;
}
718 #endif
719
720
721 #if 0
722 #define _STRDUP(s) ufdbStrdup(s)
723 #else
724 /*
725 * Small speed optimisation: allocate memory for strdupped strings in large blocks since they are never freed.
726 */
727 UFDB_GCC_HOT UFDB_GCC_INLINE
_STRDUP(char * s)728 static char * _STRDUP(
729 char * s )
730 {
731 static char * freeMem = NULL;
732 static char * last = NULL;
733
734 char * f;
735 char * p;
736
737 if ((int) (last - freeMem) < (int) sizeof(UFDBurlPart))
738 {
739 freeMem = (char *) malloc( 256 * 1024 );
740 last = freeMem + 256 * 1024 - 1;
741 }
742 f = freeMem;
743
744 p = (char *) memccpy( freeMem, s, '\0', sizeof(UFDBurlPart) );
745 if (p == NULL)
746 {
747 freeMem += sizeof(UFDBurlPart);
748 *freeMem++ = '\0';
749 }
750 else
751 freeMem = p;
752
753 return f;
754 }
755 #endif
756
757
758 UFDB_GCC_HOT
UFDBsanityCheckDomainname(char * url)759 static int UFDBsanityCheckDomainname( char * url )
760 {
761 unsigned char * s;
762 char * first_slash;
763 char * tld;
764 int retval;
765 char * oldBracket;
766
767 #if 0
768 fprintf( stderr, "UFDBsanityCheckDomainname: %s\n", url );
769 #endif
770
771 if (*url == '[') /* IPv6 address */
772 {
773 char normalisedDomain[64];
774
775 oldBracket = strchr( url, ']' );
776 if (oldBracket == NULL)
777 {
778 fprintf( stderr, "error: IPv6 address has no closing ']': %s\n", url );
779 return 0;
780 }
781 *oldBracket = '\0';
782 if (UFDBparseIPv6address( url+1, normalisedDomain ) == NULL)
783 {
784 *oldBracket = ']';
785 return 0;
786 }
787 else
788 {
789 *oldBracket = ']';
790 UFDBupdateURLwithNormalisedDomain( url, normalisedDomain );
791 return 1;
792 }
793 }
794
795 first_slash = strchr( url, '/' );
796 if (first_slash != NULL)
797 *first_slash = '\0';
798
799 tld = strrchr( url, '.' );
800 if (tld == NULL)
801 tld = url;
802 else
803 tld++;
804
805 retval = 1;
806 if (*tld >= '0' && *tld <= '9')
807 ;
808 else
809 if (
810 strmatch2( tld, "ac" ) != 0 &&
811 strmatch2( tld, "ad" ) != 0 &&
812 strmatch3( tld, "ads" ) != 0 &&
813 strmatch5( tld, "adult" ) != 0 &&
814 strmatch2( tld, "ae" ) != 0 &&
815 strmatch4( tld, "aero" ) != 0 &&
816 strmatch2( tld, "af" ) != 0 &&
817 strmatch2( tld, "ag" ) != 0 &&
818 strmatch6( tld, "agency" ) != 0 &&
819 strmatch2( tld, "ai" ) != 0 &&
820 strmatch2( tld, "al" ) != 0 &&
821 strmatch2( tld, "am" ) != 0 &&
822 strmatch9( tld, "amsterdam" ) != 0 &&
823 strmatch2( tld, "an" ) != 0 &&
824 strmatch2( tld, "ao" ) != 0 &&
825 strmatch3( tld, "app" ) != 0 &&
826 strmatch5( tld, "apple" ) != 0 &&
827 strmatch2( tld, "aq" ) != 0 &&
828 strmatch2( tld, "ar" ) != 0 &&
829 strmatch4( tld, "arpa" ) != 0 &&
830 strmatch2( tld, "as" ) != 0 &&
831 strmatch4( tld, "asia" ) != 0 &&
832 strmatch2( tld, "at" ) != 0 &&
833 strmatch2( tld, "au" ) != 0 &&
834 strmatch5( tld, "audio" ) != 0 &&
835 strmatch2( tld, "aw" ) != 0 &&
836 strmatch2( tld, "ax" ) != 0 &&
837 strmatch2( tld, "az" ) != 0 &&
838 strmatch2( tld, "ba" ) != 0 &&
839 strmatch6( tld, "bayern" ) != 0 &&
840 strmatch2( tld, "bb" ) != 0 &&
841 strmatch2( tld, "bd" ) != 0 &&
842 strmatch2( tld, "be" ) != 0 &&
843 strmatch6( tld, "berlin" ) != 0 &&
844 strmatch2( tld, "bf" ) != 0 &&
845 strmatch2( tld, "bg" ) != 0 &&
846 strmatch2( tld, "bh" ) != 0 &&
847 strmatch6( tld, "bharat" ) != 0 &&
848 strmatch2( tld, "bi" ) != 0 &&
849 strmatch4( tld, "bike" ) != 0 &&
850 strmatch3( tld, "biz" ) != 0 &&
851 strmatch2( tld, "bj" ) != 0 &&
852 strmatch4( tld, "blue" ) != 0 &&
853 strmatch2( tld, "bm" ) != 0 &&
854 strmatch2( tld, "bn" ) != 0 &&
855 strmatch10( tld, "bnpparibas" ) != 0 &&
856 strmatch2( tld, "bo" ) != 0 &&
857 strmatch2( tld, "br" ) != 0 &&
858 strmatch8( tld, "bradesco" ) != 0 &&
859 strmatch8( tld, "brussels" ) != 0 &&
860 strmatch2( tld, "bs" ) != 0 &&
861 strmatch2( tld, "bt" ) != 0 &&
862 strmatch2( tld, "bv" ) != 0 &&
863 strmatch2( tld, "bw" ) != 0 &&
864 strmatch2( tld, "by" ) != 0 &&
865 strmatch2( tld, "bz" ) != 0 &&
866 strmatch3( tld, "bzh" ) != 0 &&
867 strmatch2( tld, "ca" ) != 0 &&
868 strmatch4( tld, "cafe" ) != 0 &&
869 strmatch6( tld, "camera" ) != 0 &&
870 strmatch6( tld, "casino" ) != 0 &&
871 strmatch3( tld, "cat" ) != 0 &&
872 strmatch2( tld, "cc" ) != 0 &&
873 strmatch2( tld, "cd" ) != 0 &&
874 strmatch6( tld, "center" ) != 0 &&
875 strmatch2( tld, "cf" ) != 0 &&
876 strmatch2( tld, "cg" ) != 0 &&
877 strmatch2( tld, "ch" ) != 0 &&
878 strmatch5( tld, "cheap" ) != 0 &&
879 strmatch2( tld, "ci" ) != 0 &&
880 strmatch2( tld, "ck" ) != 0 &&
881 strmatch2( tld, "cl" ) != 0 &&
882 strmatch4( tld, "club" ) != 0 &&
883 strmatch2( tld, "cm" ) != 0 &&
884 strmatch2( tld, "cn" ) != 0 &&
885 strmatch2( tld, "co" ) != 0 &&
886 strmatch5( tld, "coach" ) != 0 &&
887 strmatch5( tld, "codes" ) != 0 &&
888 strmatch3( tld, "com" ) != 0 &&
889 strmatch7( tld, "company" ) != 0 &&
890 strmatch4( tld, "coop" ) != 0 &&
891 strmatch7( tld, "country" ) != 0 &&
892 strmatch2( tld, "cr" ) != 0 &&
893 strmatch7( tld, "cricket" ) != 0 &&
894 strmatch2( tld, "cu" ) != 0 &&
895 strmatch2( tld, "cu" ) != 0 &&
896 strmatch2( tld, "cv" ) != 0 &&
897 strmatch2( tld, "cx" ) != 0 &&
898 strmatch2( tld, "cy" ) != 0 &&
899 strmatch2( tld, "cz" ) != 0 &&
900 strmatch6( tld, "dating" ) != 0 &&
901 strmatch2( tld, "de" ) != 0 &&
902 strmatch6( tld, "degree" ) != 0 &&
903 strmatch4( tld, "desi" ) != 0 &&
904 strmatch9( tld, "directory" ) != 0 &&
905 strmatch2( tld, "dj" ) != 0 &&
906 strmatch2( tld, "dk" ) != 0 &&
907 strmatch2( tld, "dm" ) != 0 &&
908 strmatch2( tld, "do" ) != 0 &&
909 strmatch2( tld, "do" ) != 0 &&
910 strmatch2( tld, "dz" ) != 0 &&
911 strmatch2( tld, "ec" ) != 0 &&
912 strmatch3( tld, "edu" ) != 0 &&
913 strmatch2( tld, "ee" ) != 0 &&
914 strmatch2( tld, "eg" ) != 0 &&
915 strmatch5( tld, "email" ) != 0 &&
916 strmatch2( tld, "er" ) != 0 &&
917 strmatch2( tld, "es" ) != 0 &&
918 strmatch2( tld, "et" ) != 0 &&
919 strmatch2( tld, "eu" ) != 0 &&
920 strmatch3( tld, "eus" ) != 0 &&
921 strmatch6( tld, "expert" ) != 0 &&
922 strmatch2( tld, "fi" ) != 0 &&
923 strmatch7( tld, "fitness" ) != 0 &&
924 strmatch2( tld, "fj" ) != 0 &&
925 strmatch2( tld, "fk" ) != 0 &&
926 strmatch2( tld, "fm" ) != 0 &&
927 strmatch2( tld, "fm" ) != 0 &&
928 strmatch2( tld, "fo" ) != 0 &&
929 strmatch8( tld, "football" ) != 0 &&
930 strmatch10( tld, "foundation" ) != 0 &&
931 strmatch2( tld, "fr" ) != 0 &&
932 strmatch2( tld, "fr" ) != 0 &&
933 strmatch2( tld, "ga" ) != 0 &&
934 strmatch7( tld, "gallery" ) != 0 &&
935 strmatch2( tld, "gb" ) != 0 &&
936 strmatch2( tld, "gd" ) != 0 &&
937 strmatch2( tld, "ge" ) != 0 &&
938 strmatch4( tld, "gent" ) != 0 &&
939 strmatch2( tld, "gf" ) != 0 &&
940 strmatch2( tld, "gg" ) != 0 &&
941 strmatch2( tld, "gg" ) != 0 &&
942 strmatch2( tld, "gh" ) != 0 &&
943 strmatch2( tld, "gi" ) != 0 &&
944 strmatch2( tld, "gi" ) != 0 &&
945 strmatch2( tld, "gl" ) != 0 &&
946 strmatch5( tld, "glass" ) != 0 &&
947 strmatch2( tld, "gm" ) != 0 &&
948 strmatch5( tld, "gmail" ) != 0 &&
949 strmatch2( tld, "gn" ) != 0 &&
950 strmatch4( tld, "golf" ) != 0 &&
951 strmatch3( tld, "gov" ) != 0 &&
952 strmatch2( tld, "gp" ) != 0 &&
953 strmatch2( tld, "gq" ) != 0 &&
954 strmatch2( tld, "gr" ) != 0 &&
955 strmatch8( tld, "graphics" ) != 0 &&
956 strmatch6( tld, "gratis" ) != 0 &&
957 strmatch2( tld, "gs" ) != 0 &&
958 strmatch2( tld, "gt" ) != 0 &&
959 strmatch2( tld, "gt" ) != 0 &&
960 strmatch2( tld, "gu" ) != 0 &&
961 strmatch5( tld, "guide" ) != 0 &&
962 strmatch4( tld, "guru" ) != 0 &&
963 strmatch2( tld, "gw" ) != 0 &&
964 strmatch2( tld, "gy" ) != 0 &&
965 strmatch4( tld, "help" ) != 0 &&
966 strmatch2( tld, "hk" ) != 0 &&
967 strmatch2( tld, "hm" ) != 0 &&
968 strmatch2( tld, "hn" ) != 0 &&
969 strmatch8( tld, "holdings" ) != 0 &&
970 strmatch4( tld, "host" ) != 0 &&
971 strmatch3( tld, "how" ) != 0 &&
972 strmatch2( tld, "hr" ) != 0 &&
973 strmatch2( tld, "ht" ) != 0 &&
974 strmatch2( tld, "hu" ) != 0 &&
975 strmatch2( tld, "id" ) != 0 &&
976 strmatch2( tld, "ie" ) != 0 &&
977 strmatch2( tld, "il" ) != 0 &&
978 strmatch2( tld, "im" ) != 0 &&
979 strmatch2( tld, "in" ) != 0 &&
980 strmatch4( tld, "info" ) != 0 &&
981 strmatch3( tld, "ink" ) != 0 &&
982 strmatch3( tld, "int" ) != 0 &&
983 strmatch13( tld, "international" ) != 0 &&
984 strmatch11( tld, "investments" ) != 0 &&
985 strmatch2( tld, "io" ) != 0 &&
986 strmatch2( tld, "iq" ) != 0 &&
987 strmatch2( tld, "ir" ) != 0 &&
988 strmatch2( tld, "is" ) != 0 &&
989 strmatch2( tld, "it" ) != 0 &&
990 strmatch2( tld, "je" ) != 0 &&
991 strmatch5( tld, "jetzt" ) != 0 &&
992 strmatch2( tld, "jm" ) != 0 &&
993 strmatch2( tld, "jo" ) != 0 &&
994 strmatch4( tld, "jobs" ) != 0 &&
995 strmatch2( tld, "jp" ) != 0 &&
996 strmatch6( tld, "kaufen" ) != 0 &&
997 strmatch2( tld, "ke" ) != 0 &&
998 strmatch2( tld, "kg" ) != 0 &&
999 strmatch2( tld, "kh" ) != 0 &&
1000 strmatch2( tld, "ki" ) != 0 &&
1001 strmatch2( tld, "km" ) != 0 &&
1002 strmatch2( tld, "kn" ) != 0 &&
1003 strmatch5( tld, "koeln" ) != 0 &&
1004 strmatch2( tld, "kr" ) != 0 &&
1005 strmatch2( tld, "kw" ) != 0 &&
1006 strmatch2( tld, "ky" ) != 0 &&
1007 strmatch2( tld, "kz" ) != 0 &&
1008 strmatch2( tld, "la" ) != 0 &&
1009 strmatch4( tld, "land" ) != 0 &&
1010 strmatch2( tld, "lb" ) != 0 &&
1011 strmatch2( tld, "lc" ) != 0 &&
1012 strmatch2( tld, "li" ) != 0 &&
1013 strmatch4( tld, "link" ) != 0 &&
1014 strmatch2( tld, "lk" ) != 0 &&
1015 strmatch3( tld, "lol" ) != 0 &&
1016 strmatch6( tld, "london" ) != 0 &&
1017 strmatch4( tld, "love" ) != 0 &&
1018 strmatch2( tld, "lr" ) != 0 &&
1019 strmatch2( tld, "ls" ) != 0 &&
1020 strmatch2( tld, "lt" ) != 0 &&
1021 strmatch2( tld, "lu" ) != 0 &&
1022 strmatch2( tld, "lv" ) != 0 &&
1023 strmatch2( tld, "ly" ) != 0 &&
1024 strmatch2( tld, "ma" ) != 0 &&
1025 strmatch2( tld, "mc" ) != 0 &&
1026 strmatch2( tld, "md" ) != 0 &&
1027 strmatch2( tld, "me" ) != 0 &&
1028 strmatch5( tld, "media" ) != 0 &&
1029 strmatch8( tld, "memorial" ) != 0 &&
1030 strmatch4( tld, "menu" ) != 0 &&
1031 strmatch2( tld, "mg" ) != 0 &&
1032 strmatch2( tld, "mh" ) != 0 &&
1033 strmatch3( tld, "mil" ) != 0 &&
1034 strmatch2( tld, "mk" ) != 0 &&
1035 strmatch2( tld, "ml" ) != 0 &&
1036 strmatch2( tld, "mm" ) != 0 &&
1037 strmatch2( tld, "mn" ) != 0 &&
1038 strmatch2( tld, "mo" ) != 0 &&
1039 strmatch4( tld, "mobi" ) != 0 &&
1040 strmatch6( tld, "mobile" ) != 0 &&
1041 strmatch4( tld, "moda" ) != 0 &&
1042 strmatch3( tld, "moe" ) != 0 &&
1043 strmatch2( tld, "mp" ) != 0 &&
1044 strmatch2( tld, "mr" ) != 0 &&
1045 strmatch2( tld, "ms" ) != 0 &&
1046 strmatch2( tld, "mt" ) != 0 &&
1047 strmatch2( tld, "mt" ) != 0 &&
1048 strmatch2( tld, "mu" ) != 0 &&
1049 strmatch6( tld, "museum" ) != 0 &&
1050 strmatch5( tld, "music" ) != 0 &&
1051 strmatch2( tld, "mv" ) != 0 &&
1052 strmatch2( tld, "mw" ) != 0 &&
1053 strmatch2( tld, "mx" ) != 0 &&
1054 strmatch2( tld, "my" ) != 0 &&
1055 strmatch2( tld, "mz" ) != 0 &&
1056 strmatch2( tld, "mz" ) != 0 &&
1057 strmatch2( tld, "na" ) != 0 &&
1058 strmatch4( tld, "name" ) != 0 &&
1059 strmatch2( tld, "nc" ) != 0 &&
1060 strmatch2( tld, "ne" ) != 0 &&
1061 strmatch3( tld, "net" ) != 0 &&
1062 strmatch7( tld, "network" ) != 0 &&
1063 strmatch2( tld, "nf" ) != 0 &&
1064 strmatch2( tld, "ng" ) != 0 &&
1065 strmatch3( tld, "ngo" ) != 0 &&
1066 strmatch2( tld, "ni" ) != 0 &&
1067 strmatch2( tld, "ni" ) != 0 &&
1068 strmatch5( tld, "ninja" ) != 0 &&
1069 strmatch2( tld, "nl" ) != 0 &&
1070 strmatch2( tld, "no" ) != 0 &&
1071 strmatch2( tld, "np" ) != 0 &&
1072 strmatch2( tld, "nr" ) != 0 &&
1073 strmatch3( tld, "nrw" ) != 0 &&
1074 strmatch2( tld, "nu" ) != 0 &&
1075 strmatch2( tld, "nu" ) != 0 &&
1076 strmatch2( tld, "nz" ) != 0 &&
1077 strmatch2( tld, "om" ) != 0 &&
1078 strmatch3( tld, "ong" ) != 0 &&
1079 strmatch3( tld, "ooo" ) != 0 &&
1080 strmatch3( tld, "org" ) != 0 &&
1081 strmatch3( tld, "ovh" ) != 0 &&
1082 strmatch2( tld, "pa" ) != 0 &&
1083 strmatch5( tld, "paris" ) != 0 &&
1084 strmatch5( tld, "party" ) != 0 &&
1085 strmatch2( tld, "pe" ) != 0 &&
1086 strmatch2( tld, "pf" ) != 0 &&
1087 strmatch2( tld, "pg" ) != 0 &&
1088 strmatch2( tld, "ph" ) != 0 &&
1089 strmatch8( tld, "pharmacy" ) != 0 &&
1090 strmatch5( tld, "photo" ) != 0 &&
1091 strmatch6( tld, "photos" ) != 0 &&
1092 strmatch4( tld, "pics" ) != 0 &&
1093 strmatch4( tld, "pink" ) != 0 &&
1094 strmatch2( tld, "pk" ) != 0 &&
1095 strmatch2( tld, "pl" ) != 0 &&
1096 strmatch2( tld, "pm" ) != 0 &&
1097 strmatch2( tld, "pn" ) != 0 &&
1098 strmatch5( tld, "poker" ) != 0 &&
1099 strmatch4( tld, "post" ) != 0 &&
1100 strmatch2( tld, "pr" ) != 0 &&
1101 strmatch5( tld, "press" ) != 0 &&
1102 strmatch3( tld, "pro" ) != 0 &&
1103 strmatch2( tld, "ps" ) != 0 &&
1104 strmatch2( tld, "pt" ) != 0 &&
1105 strmatch3( tld, "pub" ) != 0 &&
1106 strmatch2( tld, "pw" ) != 0 &&
1107 strmatch2( tld, "py" ) != 0 &&
1108 strmatch2( tld, "py" ) != 0 &&
1109 strmatch2( tld, "qa" ) != 0 &&
1110 strmatch2( tld, "re" ) != 0 &&
1111 strmatch3( tld, "red" ) != 0 &&
1112 strmatch7( tld, "reviews" ) != 0 &&
1113 strmatch10( tld, "restaurant" ) != 0 &&
1114 strmatch2( tld, "ro" ) != 0 &&
1115 strmatch5( tld, "rocks" ) != 0 &&
1116 strmatch2( tld, "rs" ) != 0 &&
1117 strmatch2( tld, "ru" ) != 0 &&
1118 strmatch4( tld, "ruhr" ) != 0 &&
1119 strmatch2( tld, "rw" ) != 0 &&
1120 strmatch2( tld, "sa" ) != 0 &&
1121 strmatch2( tld, "sb" ) != 0 &&
1122 strmatch2( tld, "sc" ) != 0 &&
1123 strmatch7( tld, "science" ) != 0 &&
1124 strmatch6( tld, "school" ) != 0 &&
1125 strmatch4( tld, "scot" ) != 0 &&
1126 strmatch2( tld, "sd" ) != 0 &&
1127 strmatch2( tld, "se" ) != 0 &&
1128 strmatch4( tld, "sexy" ) != 0 &&
1129 strmatch2( tld, "sg" ) != 0 &&
1130 strmatch2( tld, "sh" ) != 0 &&
1131 strmatch2( tld, "si" ) != 0 &&
1132 strmatch7( tld, "singles" ) != 0 &&
1133 strmatch2( tld, "sj" ) != 0 &&
1134 strmatch2( tld, "sk" ) != 0 &&
1135 strmatch3( tld, "ski" ) != 0 &&
1136 strmatch2( tld, "sl" ) != 0 &&
1137 strmatch2( tld, "sm" ) != 0 &&
1138 strmatch2( tld, "sn" ) != 0 &&
1139 strmatch2( tld, "so" ) != 0 &&
1140 strmatch6( tld, "social" ) != 0 &&
1141 strmatch5( tld, "sport" ) != 0 &&
1142 strmatch2( tld, "sr" ) != 0 &&
1143 strmatch2( tld, "st" ) != 0 &&
1144 strmatch2( tld, "su" ) != 0 &&
1145 strmatch2( tld, "sv" ) != 0 &&
1146 strmatch2( tld, "sx" ) != 0 &&
1147 strmatch2( tld, "sy" ) != 0 &&
1148 strmatch7( tld, "systems" ) != 0 &&
1149 strmatch2( tld, "sz" ) != 0 &&
1150 strmatch5( tld, "tatar" ) != 0 &&
1151 strmatch2( tld, "tc" ) != 0 &&
1152 strmatch2( tld, "td" ) != 0 &&
1153 strmatch4( tld, "tech" ) != 0 &&
1154 strmatch10( tld, "technology" ) != 0 &&
1155 strmatch3( tld, "tel" ) != 0 &&
1156 strmatch2( tld, "tf" ) != 0 &&
1157 strmatch2( tld, "tg" ) != 0 &&
1158 strmatch2( tld, "th" ) != 0 &&
1159 strmatch4( tld, "tips" ) != 0 &&
1160 strmatch5( tld, "tirol" ) != 0 &&
1161 strmatch2( tld, "tj" ) != 0 &&
1162 strmatch2( tld, "tk" ) != 0 &&
1163 strmatch2( tld, "tl" ) != 0 &&
1164 strmatch2( tld, "tm" ) != 0 &&
1165 strmatch2( tld, "tn" ) != 0 &&
1166 strmatch2( tld, "to" ) != 0 &&
1167 strmatch5( tld, "today" ) != 0 &&
1168 strmatch3( tld, "top" ) != 0 &&
1169 strmatch2( tld, "tp" ) != 0 &&
1170 strmatch2( tld, "tr" ) != 0 &&
1171 strmatch8( tld, "training" ) != 0 &&
1172 strmatch6( tld, "travel" ) != 0 &&
1173 strmatch2( tld, "tt" ) != 0 &&
1174 strmatch2( tld, "tv" ) != 0 &&
1175 strmatch2( tld, "tw" ) != 0 &&
1176 strmatch2( tld, "tz" ) != 0 &&
1177 strmatch2( tld, "ua" ) != 0 &&
1178 strmatch2( tld, "ug" ) != 0 &&
1179 strmatch2( tld, "uk" ) != 0 &&
1180 strmatch2( tld, "um" ) != 0 &&
1181 strmatch3( tld, "uno" ) != 0 &&
1182 strmatch2( tld, "us" ) != 0 &&
1183 strmatch2( tld, "uy" ) != 0 &&
1184 strmatch2( tld, "uz" ) != 0 &&
1185 strmatch2( tld, "va" ) != 0 &&
1186 strmatch2( tld, "vc" ) != 0 &&
1187 strmatch2( tld, "ve" ) != 0 &&
1188 strmatch5( tld, "vegas" ) != 0 &&
1189 strmatch12( tld, "versicherung" ) != 0 &&
1190 strmatch2( tld, "vg" ) != 0 &&
1191 strmatch2( tld, "vi" ) != 0 &&
1192 strmatch10( tld, "vlaanderen" ) != 0 &&
1193 strmatch2( tld, "vn" ) != 0 &&
1194 strmatch6( tld, "voyage" ) != 0 &&
1195 strmatch2( tld, "vu" ) != 0 &&
1196 strmatch4( tld, "wang" ) != 0 &&
1197 strmatch5( tld, "wales" ) != 0 &&
1198 strmatch6( tld, "webcam" ) != 0 &&
1199 strmatch7( tld, "website" ) != 0 &&
1200 strmatch2( tld, "wf" ) != 0 &&
1201 strmatch7( tld, "whoswho" ) != 0 &&
1202 strmatch4( tld, "wifi" ) != 0 &&
1203 strmatch4( tld, "wiki" ) != 0 &&
1204 strmatch4( tld, "work" ) != 0 &&
1205 strmatch5( tld, "world" ) != 0 &&
1206 strmatch4( tld, "wpad" ) != 0 &&
1207 strmatch2( tld, "ws" ) != 0 &&
1208 strmatch3( tld, "wtf" ) != 0 &&
1209 strmatch4( tld, "xn--" ) != 0 && /* e.g. xn--p1ai */
1210 strmatch3( tld, "xxx" ) != 0 &&
1211 strmatch3( tld, "xyz" ) != 0 &&
1212 strmatch2( tld, "ye" ) != 0 &&
1213 strmatch2( tld, "yt" ) != 0 &&
1214 strmatch2( tld, "yu" ) != 0 &&
1215 strmatch2( tld, "za" ) != 0 &&
1216 strmatch2( tld, "zm" ) != 0 &&
1217 strmatch4( tld, "zone" ) != 0 &&
1218 strmatch2( tld, "zw" ) != 0
1219 )
1220 {
1221 if (!doWarnings)
1222 fprintf( stderr, "warning: possibly incorrect domain name: %s\n", url );
1223 retval = 1;
1224 }
1225
1226 /* allowed characters:
1227 * 0 - 9
1228 * A - Z a - z
1229 * [ ] :
1230 * - . %
1231 * illegal but common: _
1232 * URL may start with "|." which is interpreted as "no subdomain".
1233 * RFC3986 is superceded by RFC5890-5895
1234 * According to the new RFCs labels can contain UTF characters
1235 */
1236 s = (unsigned char *) url;
1237 if (*s == '|' && *(s+1) == '.')
1238 s += 2;
1239 for (; *s != '\0'; s++)
1240 {
1241 if (!( (*s >= 'a' && *s <= 'z') ||
1242 (*s >= 'A' && *s <= 'Z') ||
1243 (*s >= '0' && *s <= '9') ||
1244 (*s == '.' || *s == '-' || *s == ':' || *s == '[' || *s == ']' || *s == '%') ||
1245 (*s == '_') || *s >= 0x80 ))
1246 {
1247 fprintf( stderr, "error: domainname '%s' has illegal character '%c'\n", url, *s );
1248 retval = 0;
1249 break;
1250 }
1251 }
1252
1253 if (first_slash != NULL)
1254 *first_slash = '/';
1255
1256 return retval;
1257 }
1258
1259
initTable(char * tableName)1260 void initTable( char * tableName )
1261 {
1262 table = (struct UFDBgentableNode *) malloc( sizeof( struct UFDBgentableNode ) );
1263 table->tag = (unsigned char *) _STRDUP( tableName );
1264 table->totalSubNodes = 0;
1265 table->nSTA = 0;
1266 table->stas = NULL;
1267 numNodes++;
1268
1269 numIndexNodes = 0;
1270 }
1271
1272
#if 0
/*
 * _trealloc: DISABLED code (compiled out by the surrounding "#if 0").
 *
 * Resizes an array of struct UFDBgentableNode that must hold n elements.
 *    p   the current array
 *    n   the required number of elements
 * Returns the (possibly moved) array.
 *
 * The array is only reallocated when the rounded-up element count for n differs
 * from the rounded-up count for n-1; otherwise p is returned unchanged.
 * ROUNDUP is used for n below ROUNDUPBYCUTOFF and BIGROUNDUP for larger n.
 * NOTE(review): ROUNDUP and BIGROUNDUP are defined elsewhere; they presumably round
 * a count up to an allocation granularity - confirm before re-enabling this code.
 */
static UFDB_GCC_MALLOC_ATTR UFDB_GCC_INLINE void * _trealloc( void * p, int n )
{
   int nup;

   /* n == 2 always reallocates to the rounded-up size for 2 elements */
   if (n == 2)
      return realloc( p, ROUNDUP(2) * sizeof(struct UFDBgentableNode) );

   if (n < ROUNDUPBYCUTOFF)
   {
      nup = ROUNDUP(n);
      if (nup == ROUNDUP(n-1))		/* same rounded-up size as for n-1: nothing to do */
         return p;
   }
   else
   {
      nup = BIGROUNDUP(n);
      if (nup == BIGROUNDUP(n-1))	/* same rounded-up size as for n-1: nothing to do */
         return p;
   }

   /* the rounded-up size changed: resize to nup elements */
   return realloc( p, nup * sizeof(struct UFDBgentableNode) );
}
#endif
1297
1298
1299 #if UFDB_OVERRIDE_GCC_OPT && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4))
1300 #pragma GCC push_options
1301 #pragma GCC optimize ("O3")
1302 /* do not use loop unrolling since it is slower */
1303 #endif
1304
1305
1306 /*
1307 * UFDBinsertURL: insert revURL into table t.
1308 * return values: 1 if revURL already exists, 0 if revURL was inserted.
1309 */
1310 UFDB_GCC_HOT
UFDBinsertURL(struct UFDBgentableNode * node,UFDBrevURL * revURL,char * origURL,UFDBurlType type)1311 static int UFDBinsertURL(
1312 struct UFDBgentableNode * node,
1313 UFDBrevURL * revURL,
1314 char * origURL,
1315 UFDBurlType type )
1316 {
1317 /*
1318 * find the index where our URL has to be inserted before or is equal to
1319 * e.g. the level "net" is either "< nl" or "= net".
1320 */
1321 int cmp;
1322 unsigned int sta_i, i, j;
1323 int b, e;
1324 int rv;
1325 int tailInsert;
1326 struct UFDBgentableSTA * sta;
1327
1328 tailInsert = rv = 0;
1329 cmp = 0;
1330 i = 0;
1331 sta_i = -1;
1332
1333 newlevel:
1334 DEBUG(( stderr, "newlevel: UFDBinsertURL( %p, %p, %d ) nodetag '%s' urlpart '%s'\n",
1335 (void*) node, (void*) revURL, type,
1336 node != NULL ? (char *) node->tag : "NULL",
1337 revURL != NULL ? (char*) revURL->part : "NULL" ));
1338
1339 if (revURL == NULL)
1340 {
1341 if (node != NULL)
1342 {
1343 DEBUG(( stderr, " revURL=NULL node: totalSubNodes=%d, nSTA=%d tag='%s'\n",
1344 node->totalSubNodes, node->nSTA, node->tag ));
1345 if (node->totalSubNodes > 0 && node->tag[0] != '/')
1346 {
1347 /* interesting... we are trying to insert "xxx.com" while the tree already
1348 * has one or more members with subnodes like sss.xxx.com or xxx.com/foo[?a=1].
1349 * Lets optimise this and get rid of the subdomains !
1350 */
1351 DEBUG(( stderr, " inserted URL has subdomains, first subdomain/path is '%s'\n",
1352 node->stas->subNodes[0].tag ));
1353 if (ufdbGV.debug)
1354 ufdbLogMessage( "inserted URL (%s) part has subnodes, first %s is '%s' urlpart NULL "
1355 "nodetag '%s' removing subnodes",
1356 origURL,
1357 type==UFDBdomain ? "subdomain" : "path/parameter",
1358 node->stas->subNodes[0].tag, node->tag );
1359 if (!skipOptimisations)
1360 {
1361 DEBUG(( stderr, " removing subdomains of node '%s'\n", node->tag ));
1362 rv = 1;
1363 node->totalSubNodes = 0;
1364 node->nSTA = 0;
1365 free( node->stas ); /* TO-DO: should free() a tree ! */
1366 node->stas = NULL;
1367 }
1368 }
1369 }
1370 else
1371 {
1372 DEBUG(( stderr, " revURL=NULL node=NULL\n" ));
1373 }
1374 return rv;
1375 }
1376
1377 /* there is a 2-level insert:
1378 * first find the appropriate STA for the insert and then insert the revURL into nodes[].
1379 * If the STA is full, split it into 2 STAs.
1380 *
1381 * If the input file is already (mostly) sorted, almost all insertions take place at the end.
1382 * So lets optimise this by first looking at the end before doing a binary search.
1383 *
1384 * We may have to delete URLs if we are inserting a URL with a shorter path and
1385 * URLs with a longer path (and same path as the one being inserted) exists.
1386 */
1387
1388 if (node->nSTA == 0) /* the very first entry at this level */
1389 {
1390 DEBUG(( stderr, " UFDBinsertURL nSTA=0 creating first STA for node '%s' and subNode '%s'\n",
1391 node->tag, (revURL==NULL ? (unsigned char *)"NULL" : revURL->part) ));
1392
1393 numNodes++;
1394 node->totalSubNodes = 1;
1395 node->nSTA = 1;
1396 node->stas = (struct UFDBgentableSTA *) malloc( 1 * sizeof(struct UFDBgentableSTA) );
1397 sta = node->stas;
1398 sta->nSubNodes = 1;
1399 sta->nodeArraySize = MinNodesPerSTA;
1400 sta->subNodes = (struct UFDBgentableNode *) malloc( MinNodesPerSTA * sizeof(struct UFDBgentableNode) );
1401 sta->subNodes->totalSubNodes = 0;
1402 sta->subNodes->nSTA = 0;
1403 sta->subNodes->stas = NULL;
1404 sta->subNodes->tag = (unsigned char *) _STRDUP( (char *) revURL->part );
1405
1406 node = &( sta->subNodes[0] );
1407 revURL = revURL->next;
1408 DEBUG(( stderr, " UFDBinsertURL going to newlevel\n" ));
1409 goto newlevel;
1410 }
1411
1412 /* check for tail insert: compare with last element of last STA */
1413 sta_i = node->nSTA - 1;
1414 sta = &( node->stas[sta_i] );
1415 i = sta->nSubNodes - 1;
1416 DEBUG(( stderr, " going to insert '%s' and last node of the last STA(0x%08lx:%d:%d) is '%s'\n",
1417 (char *) revURL->part, (long) sta, sta_i, sta->nSubNodes, (char *) sta->subNodes[i].tag ));
1418
1419 cmp = strcmpURLpart( (char*) revURL->part, (char*) sta->subNodes[i].tag );
1420 DEBUG(( stderr, " strcmpURLpart( %s, %s ) = %d\n", (char*) revURL->part,
1421 (char*) sta->subNodes[i].tag, cmp ));
1422 if (cmp == 0)
1423 {
1424 if (revURL->part[0] == '/' && sta->subNodes[i].totalSubNodes > 0)
1425 {
1426 cmp = strcmp( (char*) revURL->part, (char*) sta->subNodes[i].tag );
1427 DEBUG(( stderr, " nodetag==urlpart and PATH and node tag has children. new cmp %d\n", cmp ));
1428 }
1429 else
1430 {
1431 /* node tag matches url tag; there will be no insert at this level */
1432 DEBUG(( stderr, " nodetag==urlpart: do not insert; goto node_match\n" ));
1433 goto node_match;
1434 }
1435 }
1436 if (cmp > 0) /* it is a tail insert; we are done with the search for STA and STA index */
1437 {
1438 tailInsert = 1;
1439 goto sta_found;
1440 }
1441
1442 /* there is no tail insert, so start with finding the right STA */
1443 e = node->nSTA - 1;
1444 if (e == 0)
1445 goto sta_found; /* there is only one STA; sta and sta_i are already set */
1446 b = 0;
1447 while (b <= e)
1448 {
1449 sta_i = (b + e) / 2;
1450 sta = &( node->stas[sta_i] );
1451 i = 0;
1452 cmp = strcmpURLpart( (char *) revURL->part, (char *) sta->subNodes[i].tag );
1453 if (cmp < 0)
1454 e = sta_i - 1;
1455 else if (cmp == 0)
1456 goto node_match;
1457 else /* cmp>0 */
1458 {
1459 i = sta->nSubNodes - 1;
1460 cmp = strcmpURLpart( (char *) revURL->part, (char *) sta->subNodes[i].tag );
1461 if (cmp == 0)
1462 goto node_match;
1463 if (cmp < 0)
1464 break;
1465 b = sta_i + 1;
1466 }
1467 }
1468 sta_found:
1469 DEBUG(( stderr, " inserting in STA sta_i=%d sta=0x%08lx\n", sta_i, (long) sta ));
1470
1471 /* The STA is found, check if an enlargement or a split is necessary */
1472 if (sta->nSubNodes == MaxNodesPerSTA)
1473 {
1474 struct UFDBgentableSTA * newsta;
1475
1476 /* split an STA into two STAs with both MaxNodesPerSTA subNodes.
1477 * The existing subNodes are divided 50/50 amongst the current STA and the new STA
1478 * EXCEPT when the current STA is the last STA due to very frequent tail insertion.
1479 */
1480 newsta = (struct UFDBgentableSTA *) malloc( sizeof(struct UFDBgentableSTA) );
1481 newsta->nodeArraySize = MaxNodesPerSTA;
1482 newsta->subNodes = (struct UFDBgentableNode *) malloc( MaxNodesPerSTA * sizeof(struct UFDBgentableNode) );
1483 if (sta_i == node->nSTA - 1) /* is the STA the last one of this node ? */
1484 {
1485 DEBUG(( stderr, "++ splitting last STA sta_i=%d\n", sta_i ));
1486 /* only transfer one subNode to the new STA */
1487 newsta->nSubNodes = 1;
1488 newsta->subNodes[0] = sta->subNodes[MaxNodesPerSTA-1];
1489 sta->nSubNodes--;
1490 /* insert the new STA in the node (at the end) */
1491 node->nSTA++;
1492 node->stas = (struct UFDBgentableSTA *)
1493 realloc( node->stas, node->nSTA * sizeof(struct UFDBgentableSTA) );
1494 node->stas[node->nSTA-1] = *newsta;
1495 /* must reassign sta because node->stas is realloced */
1496 sta = &node->stas[sta_i];
1497 newsta = &node->stas[node->nSTA-1];
1498 }
1499 else /* not the last STA of this node */
1500 {
1501 DEBUG(( stderr, "++ splitting STA sta_i=%d\n", sta_i ));
1502 /* divide subNodes 50/50 */
1503 newsta->nSubNodes = MaxNodesPerSTA/2;
1504 for (i = 0; i < MaxNodesPerSTA/2; i++)
1505 {
1506 newsta->subNodes[i] = sta->subNodes[i + MaxNodesPerSTA/2];
1507 }
1508 sta->nSubNodes -= MaxNodesPerSTA/2;
1509 /* insert the new STA in the node (not at the end) */
1510 node->nSTA++;
1511 node->stas = (struct UFDBgentableSTA *)
1512 realloc( node->stas, node->nSTA * sizeof(struct UFDBgentableSTA) );
1513 for (j = node->nSTA-1; j > sta_i+1; j--)
1514 node->stas[j] = node->stas[j-1];
1515 node->stas[sta_i+1] = *newsta;
1516 /* must reassign sta because node->stas is realloced */
1517 sta = &node->stas[sta_i];
1518 newsta = &node->stas[sta_i+1];
1519 }
1520
1521 /* determine if sta has to point to the new STA */
1522 i = 0;
1523 cmp = strcmpURLpart( (char *) revURL->part, (char *) newsta->subNodes[i].tag );
1524 if (cmp >= 0)
1525 sta = newsta;
1526 if (cmp == 0)
1527 goto node_match;
1528 }
1529 else if (sta->nSubNodes >= sta->nodeArraySize)
1530 {
1531 /* extend STA */
1532 unsigned int newSize;
1533 if (sta->nodeArraySize == 0)
1534 newSize = 1;
1535 else if (sta->nodeArraySize == 1)
1536 newSize = 8;
1537 else if (sta->nodeArraySize == 8)
1538 newSize = 256;
1539 else
1540 newSize = MaxNodesPerSTA;
1541 DEBUG(( stderr, "++ sta->nSubNodes is %d, extending subNodes array from %d to %d elements\n",
1542 sta->nSubNodes, sta->nodeArraySize, newSize ));
1543 sta->nodeArraySize = newSize;
1544 sta->subNodes = (struct UFDBgentableNode *)
1545 realloc( sta->subNodes, newSize * sizeof(struct UFDBgentableNode) );
1546 }
1547
1548 /* the STA is found, subNodes is guaranteed to have space for one more element; search now inside this STA */
1549 if (tailInsert)
1550 {
1551 i = sta->nSubNodes - 1;
1552 cmp = 1;
1553 }
1554 else
1555 {
1556 b = 0;
1557 e = sta->nSubNodes - 1;
1558 DEBUG(( stderr, " starting bsearch in STA: b 0 e %d\n", e ));
1559 while (b <= e) /* binary search STA */
1560 {
1561 i = (b + e) / 2;
1562 cmp = strcmpURLpart( (char *) revURL->part, (char *) sta->subNodes[i].tag );
1563 if (cmp < 0)
1564 e = i - 1;
1565 else if (cmp > 0)
1566 b = i + 1;
1567 else
1568 goto node_match;
1569 }
1570 DEBUG(( stderr, " NOTFOUND after bsearch in STA: part '%s' cmp %d i %d b %d e %d "
1571 "totalSubNodes %d\n",
1572 (revURL==NULL ? (unsigned char *)"NULL" : revURL->part), cmp, i, b, e, node->totalSubNodes ));
1573 }
1574
1575 if (cmp < 0 && revURL != NULL && revURL->part[0] == '/')
1576 {
1577 size_t partlen = strlen( (char*) revURL->part );
1578
1579 if (strncmp( (char*) revURL->part, (char*) sta->subNodes[i].tag, partlen ) == 0 && revURL->next == NULL)
1580 {
1581 if (ufdbGV.debug)
1582 ufdbLogMessage( "inserted URL (%s) part has subnodes, first %s is '%s' part NULL nodetag '%s'",
1583 origURL,
1584 type==UFDBdomain ? "subdomain" : "path/parameter",
1585 node->stas->subNodes[0].tag, node->tag );
1586 if (doWarnings)
1587 ufdbLogMessage( "URL with longer path is already in the table:\n"
1588 " inserting URL %s\n"
1589 " removing URL with path %s",
1590 origURL, sta->subNodes[i].tag );
1591 node = &sta->subNodes[i];
1592 node->tag[partlen] = '\0'; /* truncate the longer path effectively inserting the shorter path */
1593 if (node->totalSubNodes > 0)
1594 {
1595 /* the longer path most likely has no children but remove them if they exist */
1596 node->totalSubNodes = 0;
1597 node->nSTA = 0;
1598 free( node->stas ); /* TO-DO: should free() a tree ! */
1599 node->stas = NULL;
1600 }
1601
1602 /* remove other longer paths */
1603 i++;
1604 while (i < sta->nSubNodes &&
1605 strncmp( (char*) revURL->part, (char*) sta->subNodes[i].tag, partlen ) == 0)
1606 {
1607 if (doWarnings)
1608 ufdbLogMessage( " also removing URL with path %s", sta->subNodes[i].tag );
1609 for (j = i + 1; j < sta->nSubNodes; j++)
1610 sta->subNodes[j-1] = sta->subNodes[j];
1611 sta->nSubNodes--;
1612 }
1613
1614 /* XXX TODO: remove longer paths in the next STA */
1615
1616 /* abort the insertion */
1617 return 1;
1618 }
1619 }
1620
1621 node->totalSubNodes++;
1622 DEBUG(( stderr, " totalsubnodes of \"%s\" incremented to %d\n", node->tag, node->totalSubNodes ));
1623 numNodes++;
1624
1625 /* implemented optimisations:
1626 * do not add subdom.abc.com/aurl if abc.com is already in the tree
1627 * do not add subdom.abc.com if abc.com is already in the tree
1628 * remove subdom.abc.com from tree if abc.com is being inserted
1629 */
1630
1631 doinsert:
1632 if (cmp > 0) /* this entry > subNodes[i] */
1633 {
1634 DEBUG(( stderr, " cmp>0 after bsearch: urlpart '%s' nodetag '%s' i %d nSubNodes %d shifting %d\n",
1635 (revURL==NULL ? (unsigned char *)"NULL" : revURL->part),
1636 sta->subNodes[i].tag, i, sta->nSubNodes, sta->nSubNodes - 1 - i ));
1637 DEBUG(( stderr, " insert '%s' after '%s'\n", revURL->part, sta->subNodes[i].tag ));
1638
1639 i++;
1640
1641 sta->nSubNodes++;
1642
1643 /* make space in the array */
1644 for (j = sta->nSubNodes - 1; j > i; j--)
1645 sta->subNodes[j] = sta->subNodes[j-1];
1646
1647 /* insert the current revURL into the array */
1648 sta->subNodes[i].totalSubNodes = 0;
1649 sta->subNodes[i].nSTA = 0;
1650 sta->subNodes[i].stas = NULL;
1651 sta->subNodes[i].tag = (unsigned char *) _STRDUP( (char *) revURL->part );
1652
1653 /* process the tail of revURL */
1654 node = &( sta->subNodes[i] );
1655 revURL = revURL->next;
1656 goto newlevel;
1657 }
1658 else if (cmp < 0) /* this entry < subNodes[i] */
1659 {
1660 DEBUG(( stderr, " cmp<0 after bsearch: urlpart '%s' nodetag '%s' i %d nSubNodes %d shifting %d\n",
1661 (revURL==NULL ? (unsigned char *)"NULL" : revURL->part),
1662 sta->subNodes[i].tag, i, sta->nSubNodes, sta->nSubNodes - 1 - i ));
1663 DEBUG(( stderr, " insert '%s' before '%s'\n", revURL->part, sta->subNodes[i].tag ));
1664
1665 sta->nSubNodes++;
1666
1667 /* make space in the array */
1668 for (j = sta->nSubNodes - 1; j > i; j--)
1669 sta->subNodes[j] = sta->subNodes[j-1];
1670
1671 /* insert the current revURL into the array */
1672 sta->subNodes[i].totalSubNodes = 0;
1673 sta->subNodes[i].nSTA = 0;
1674 sta->subNodes[i].stas = NULL;
1675 sta->subNodes[i].tag = (unsigned char *) _STRDUP( (char *) revURL->part );
1676
1677 /* process the tail of revURL */
1678 node = &( sta->subNodes[i] );
1679 revURL = revURL->next;
1680 goto newlevel;
1681 }
1682 else if (cmp == 0) /* an exact match at this level */
1683 {
1684 node_match:
1685 DEBUG(( stderr, " cmp==0 after bsearch: urlpart '%s' nodetag '%s' node.totalSubNodes %u "
1686 "sta->nSubNodes %u i %d\n",
1687 revURL->part, sta->subNodes[i].tag, sta->subNodes[i].totalSubNodes, sta->nSubNodes, i ));
1688
1689 // we must deal with an exception here: if the table has example.com/foobar and we are
1690 // inserting example.com/foo?p=v then cmp==0 is wrong and we must correct it here.
1691 if (revURL->part[0] == '/' && revURL->next != NULL)
1692 {
1693 int cmp2 = strcmp( (char*) revURL->part, (char*) sta->subNodes[i].tag );
1694 if (cmp2 != 0)
1695 {
1696 DEBUG(( stderr, " correction for path %s with parameters: cmp must be %d\n",
1697 sta->subNodes[i].tag, cmp2 ));
1698 cmp = cmp2;
1699 goto doinsert;
1700 }
1701 }
1702
1703 /* do not add revURL example.com/foo if example.com is in the table */
1704 if (type == UFDBurl)
1705 {
1706 if (skipOptimisations || sta->subNodes[i].totalSubNodes != 0)
1707 {
1708 node = &( sta->subNodes[i] );
1709 revURL = revURL->next;
1710 goto newlevel;
1711 }
1712 }
1713 else
1714 {
1715 node = &( sta->subNodes[i] );
1716 revURL = revURL->next;
1717 DEBUG(( stderr, " node match: going to newlevel\n" ));
1718 goto newlevel;
1719 }
1720 }
1721
1722 return rv;
1723 }
1724
1725 #if UFDB_OVERRIDE_GCC_OPT && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4))
1726 #pragma GCC pop_options
1727 #endif
1728
1729
1730 /* generate a binary table file, database table format 2.0
1731 */
createMemTable_2_0(struct UFDBgentableNode * node)1732 void createMemTable_2_0( struct UFDBgentableNode * node )
1733 {
1734 unsigned int i, j;
1735 struct UFDBgentableSTA * sta;
1736
1737 DEBUG(( stderr, "tag=%s totalSubNodes=%d #STAs=%d\n", node->tag, node->totalSubNodes, node->nSTA ));
1738 mem_puts( node->tag );
1739
1740 if (node->totalSubNodes > 0)
1741 {
1742 DEBUG(( stderr, "sublevel " ));
1743 mem_putc( UFDBsubLevel );
1744
1745 /* write the number of subnodes in a 1-byte or 4-byte code */
1746 if (node->totalSubNodes <= 255)
1747 {
1748 DEBUG(( stderr, "#1byte-subNodes=%d\n", node->totalSubNodes ));
1749 mem_putc( node->totalSubNodes );
1750 numSub8plus++;
1751 }
1752 else
1753 {
1754 DEBUG(( stderr, "#4byte-subNodes=%d\n", node->totalSubNodes ));
1755 mem_putc( 0 );
1756 i = node->totalSubNodes;
1757 if (i >= 256*256)
1758 numSub64Kplus++;
1759 else
1760 numSub255plus++;
1761 mem_putc( i % 256 );
1762 i = i / 256;
1763 mem_putc( i % 256 );
1764 i = i / 256;
1765 mem_putc( i % 256 );
1766 if (i > 32 && doWarnings)
1767 fprintf( stderr, "Warning: LARGE number of subnodes: %d for tag %s\n", node->totalSubNodes, node->tag );
1768 }
1769 DEBUG(( stderr, " tag = %-18s sub-level %d subnode(s)\n", node->tag, node->totalSubNodes ));
1770 if (ufdbGV.debug > 1)
1771 {
1772 ufdbLogMessage( "node '%s' has %d subnodes in %d STAs", node->tag, node->totalSubNodes, node->nSTA );
1773 if (node->nSTA > 1)
1774 for (i = 0; i < node->nSTA; i++)
1775 ufdbLogMessage( " STA %d has %d subnodes", i, node->stas[i].nSubNodes );
1776 }
1777 }
1778 else
1779 {
1780 numLeafNodes++;
1781 DEBUG(( stderr, " tag = %-18s leaf (no-next-level)\n", node->tag ));
1782 }
1783
1784 for (i = 0; i < node->nSTA; i++)
1785 {
1786 sta = &node->stas[i];
1787 for (j = 0; j < sta->nSubNodes; j++)
1788 {
1789 DEBUG(( stderr, "recursive-call STA %d subnode %d\n", i, j ));
1790 createMemTable_2_0( &sta->subNodes[j] );
1791
1792 if (sta->subNodes[j].totalSubNodes == 0)
1793 {
1794 if (i != node->nSTA - 1 || j != sta->nSubNodes - 1)
1795 {
1796 DEBUG(( stderr, "samelevel\n" ));
1797 mem_putc( UFDBsameLevel );
1798 }
1799 }
1800 else
1801 {
1802 DEBUG(( stderr, "prevlevel\n" ));
1803 mem_putc( UFDBprevLevel );
1804 }
1805 }
1806 }
1807 }
1808
1809
1810 /* generate a binary table file, database table format 2.1
1811 */
1812 UFDB_GCC_HOT
createMemTable_2_1(struct UFDBgentableNode * node)1813 void createMemTable_2_1( struct UFDBgentableNode * node )
1814 {
1815 unsigned int i, j;
1816 static int indent = 0;
1817 struct UFDBgentableSTA * sta;
1818
1819 if (doPadding)
1820 mem_pad();
1821 mem_puts( node->tag );
1822
1823 if (node->totalSubNodes == 1)
1824 {
1825 mem_putc( UFDBsubLevel1 );
1826 numSub1++;
1827 DEBUG(( stderr, "%*s tag = %-18s sub-level 1 subnode\n", indent, "", node->tag ));
1828 }
1829 else if (node->totalSubNodes == 2)
1830 {
1831 mem_putc( UFDBsubLevel2 );
1832 numSub2++;
1833 DEBUG(( stderr, "%*s tag = %-18s sub-level 2 subnodes\n", indent, "", node->tag ));
1834 }
1835 else if (node->totalSubNodes == 3)
1836 {
1837 mem_putc( UFDBsubLevel3 );
1838 numSub3++;
1839 DEBUG(( stderr, "%*s tag = %-18s sub-level 3 subnodes\n", indent, "", node->tag ));
1840 }
1841 else if (node->totalSubNodes == 4)
1842 {
1843 mem_putc( UFDBsubLevel4 );
1844 numSub4++;
1845 DEBUG(( stderr, "%*s tag = %-18s sub-level 4 subnodes\n", indent, "", node->tag ));
1846 }
1847 else if (node->totalSubNodes == 5)
1848 {
1849 mem_putc( UFDBsubLevel5 );
1850 numSub5++;
1851 DEBUG(( stderr, "%*s tag = %-18s sub-level 5 subnodes\n", indent, "", node->tag ));
1852 }
1853 else if (node->totalSubNodes == 6)
1854 {
1855 mem_putc( UFDBsubLevel6 );
1856 numSub6++;
1857 DEBUG(( stderr, "%*s tag = %-18s sub-level 6 subnodes\n", indent, "", node->tag ));
1858 }
1859 else if (node->totalSubNodes == 7)
1860 {
1861 mem_putc( UFDBsubLevel7 );
1862 numSub7++;
1863 DEBUG(( stderr, "%*s tag = %-18s sub-level 7 subnodes\n", indent, "", node->tag ));
1864 }
1865 else if (node->totalSubNodes > 0)
1866 {
1867 /* write the number of subnodes in a 2-byte, 3-byte or 5-byte code */
1868 if (node->totalSubNodes < 256)
1869 {
1870 mem_putc( UFDBsubLevel );
1871 mem_putc( node->totalSubNodes ); /* between 8 and 255 */
1872 if (node->totalSubNodes == 8)
1873 numSub8++;
1874 else
1875 numSub8plus++;
1876 }
1877 else if (node->totalSubNodes < 256*256)
1878 {
1879 mem_putc( UFDBsubLevelNNN ); /* more than 255 and less than 65536 */
1880 i = node->totalSubNodes;
1881 mem_putc( i % 256 );
1882 i = i / 256;
1883 mem_putc( i % 256 );
1884 numSub255plus++;
1885 }
1886 else
1887 {
1888 mem_putc( UFDBsubLevelNNNNN ); /* more than 65535 */
1889 i = node->totalSubNodes;
1890 if (doWarnings && i > 15000000)
1891 fprintf( stderr, "LARGE number of subnodes: %d for node %s *****\n", i, node->tag );
1892 mem_putc( i % 256 );
1893 i = i / 256;
1894 mem_putc( i % 256 );
1895 i = i / 256;
1896 mem_putc( i % 256 );
1897 i = i / 256;
1898 mem_putc( i % 256 );
1899 numSub64Kplus++;
1900 }
1901 DEBUG(( stderr, "%*s tag = %-18s sub-level %d subnodes\n", indent, "", node->tag, node->totalSubNodes ));
1902 }
1903 else
1904 {
1905 numLeafNodes++;
1906 DEBUG(( stderr, "%*s tag = %-18s leaf node\n", indent, "", node->tag ));
1907 }
1908
1909 for (i = 0; i < node->nSTA; i++)
1910 {
1911 sta = &node->stas[i];
1912 for (j = 0; j < sta->nSubNodes; j++)
1913 {
1914 indent += 3;
1915 createMemTable_2_1( &(sta->subNodes[j]) );
1916 indent -= 3;
1917
1918 if (sta->subNodes[j].totalSubNodes == 0)
1919 {
1920 if (i != node->nSTA - 1 || j != sta->nSubNodes - 1)
1921 mem_putc( UFDBsameLevel );
1922 }
1923 else
1924 mem_putc( UFDBprevLevel );
1925 }
1926 }
1927 }
1928
1929
1930 /* generate a binary table file, database table format 2.2
1931 */
1932 UFDB_GCC_HOT
createMemTable_2_2(struct UFDBgentableNode * node)1933 void createMemTable_2_2( struct UFDBgentableNode * node )
1934 {
1935 unsigned int i, j;
1936 static int indent = 0;
1937 struct UFDBgentableSTA * sta;
1938
1939 if (doPadding)
1940 mem_pad();
1941 mem_putsnt( node->tag );
1942
1943 if (node->totalSubNodes == 1)
1944 {
1945 mem_putc( UFDBsubLevel1 );
1946 numSub1++;
1947 DEBUG(( stderr, "%*s tag = %-18s sub-level 1 subnode\n", indent, "", node->tag ));
1948 }
1949 else if (node->totalSubNodes == 2)
1950 {
1951 mem_putc( UFDBsubLevel2 );
1952 numSub2++;
1953 DEBUG(( stderr, "%*s tag = %-18s sub-level 2 subnodes\n", indent, "", node->tag ));
1954 }
1955 else if (node->totalSubNodes == 3)
1956 {
1957 mem_putc( UFDBsubLevel3 );
1958 numSub3++;
1959 DEBUG(( stderr, "%*s tag = %-18s sub-level 3 subnodes\n", indent, "", node->tag ));
1960 }
1961 else if (node->totalSubNodes == 4)
1962 {
1963 mem_putc( UFDBsubLevel4 );
1964 numSub4++;
1965 DEBUG(( stderr, "%*s tag = %-18s sub-level 4 subnodes\n", indent, "", node->tag ));
1966 }
1967 else if (node->totalSubNodes == 5)
1968 {
1969 mem_putc( UFDBsubLevel5 );
1970 numSub5++;
1971 DEBUG(( stderr, "%*s tag = %-18s sub-level 5 subnodes\n", indent, "", node->tag ));
1972 }
1973 else if (node->totalSubNodes == 6)
1974 {
1975 mem_putc( UFDBsubLevel6 );
1976 numSub6++;
1977 DEBUG(( stderr, "%*s tag = %-18s sub-level 6 subnodes\n", indent, "", node->tag ));
1978 }
1979 else if (node->totalSubNodes == 7)
1980 {
1981 mem_putc( UFDBsubLevel7 );
1982 numSub7++;
1983 DEBUG(( stderr, "%*s tag = %-18s sub-level 7 subnodes\n", indent, "", node->tag ));
1984 }
1985 else if (node->totalSubNodes > 0)
1986 {
1987 /* write the number of subnodes in a 2-byte, 3-byte or 5-byte code */
1988 if (node->totalSubNodes < 256)
1989 {
1990 mem_putc( UFDBsubLevel );
1991 mem_putc( node->totalSubNodes ); /* between 8 and 255 */
1992 if (node->totalSubNodes == 8)
1993 numSub8++;
1994 else
1995 numSub8plus++;
1996 }
1997 else if (node->totalSubNodes < 256*256)
1998 {
1999 mem_putc( UFDBsubLevelNNN ); /* more than 255 and less than 65536 */
2000 i = node->totalSubNodes;
2001 mem_putc( i % 256 );
2002 i = i / 256;
2003 mem_putc( i % 256 );
2004 numSub255plus++;
2005 }
2006 else
2007 {
2008 mem_putc( UFDBsubLevelNNNNN ); /* more than 65535 */
2009 i = node->totalSubNodes;
2010 if (doWarnings && i > 15000000)
2011 fprintf( stderr, "LARGE number of subnodes: %d for node %s *****\n", i, node->tag );
2012 mem_putc( i % 256 );
2013 i = i / 256;
2014 mem_putc( i % 256 );
2015 i = i / 256;
2016 mem_putc( i % 256 );
2017 i = i / 256;
2018 mem_putc( i % 256 );
2019 numSub64Kplus++;
2020 }
2021 DEBUG(( stderr, "%*s tag = %-18s sub-level %d subnodes\n", indent, "",
2022 node->tag, node->totalSubNodes ));
2023 }
2024 else
2025 {
2026 /* node->totalSubNodes == 0 */
2027 numLeafNodes++;
2028 DEBUG(( stderr, "%*s tag = %-18s leaf node\n", indent, "", node->tag ));
2029 }
2030
2031 for (i = 0; i < node->nSTA; i++)
2032 {
2033 sta = &node->stas[i];
2034 for (j = 0; j < sta->nSubNodes; j++)
2035 {
2036 indent += 3;
2037 createMemTable_2_2( &(sta->subNodes[j]) );
2038 indent -= 3;
2039
2040 if (sta->subNodes[j].totalSubNodes == 0)
2041 {
2042 if (i != node->nSTA - 1 || j != sta->nSubNodes - 1)
2043 mem_putc( UFDBsameLevel );
2044 }
2045 else
2046 mem_putc( UFDBprevLevel );
2047 }
2048 }
2049 }
2050
2051
2052 /* generate a binary table file, database table format 3.0
2053 */
2054 #if UFDB_DBFORMAT_3
2055 #include "genTable3.c"
2056 #endif
2057
2058
/* need a forward declaration since addDomain() and addOtherYoutubeURLs() call each other */
2060 UFDB_GCC_HOT UFDB_GCC_INLINE
2061 static void addDomain(
2062 UFDBthreadAdmin * admin,
2063 char * domain,
2064 UFDBurlType type );
2065
2066
2067 UFDB_GCC_HOT
addOtherYoutubeURLs(UFDBthreadAdmin * admin,char * id)2068 static void addOtherYoutubeURLs(
2069 UFDBthreadAdmin * admin,
2070 char * id )
2071 {
2072 char ** u;
2073 char tmpURL[128];
2074 char * otherURLs[] = {
2075 "m.youtube.com/watch?v=%s",
2076 "youtube.com/embed/%s",
2077 "youtube-nocookie.com/embed/%s",
2078 "youtube.com/get_video_info?video_id=%s",
2079 "youtube.com/get_video_info?content_v=%s",
2080 "youtube-nocookie.com/get_video_info?video_id=%s",
2081 "youtube-nocookie.com/get_video_info?content_v=%s",
2082 "ytimg.googleusercontent.com/vi/%s/",
2083 "i.ytimg.com/vi/%s/",
2084 "i1.ytimg.com/vi/%s/",
2085 "i2.ytimg.com/vi/%s/",
2086 "i3.ytimg.com/vi/%s/",
2087 "i4.ytimg.com/vi/%s/",
2088 "i5.ytimg.com/vi/%s/",
2089 "i6.ytimg.com/vi/%s/",
2090 "i7.ytimg.com/vi/%s/",
2091 "i8.ytimg.com/vi/%s/",
2092 "i9.ytimg.com/vi/%s/",
2093 "i.ytimg.com/vi_webp/%s/",
2094 "i1.ytimg.com/vi_webp/%s/",
2095 "i2.ytimg.com/vi_webp/%s/",
2096 "i3.ytimg.com/vi_webp/%s/",
2097 "i4.ytimg.com/vi_webp/%s/",
2098 "i5.ytimg.com/vi_webp/%s/",
2099 "i6.ytimg.com/vi_webp/%s/",
2100 "i7.ytimg.com/vi_webp/%s/",
2101 "i8.ytimg.com/vi_webp/%s/",
2102 "i9.ytimg.com/vi_webp/%s/",
2103 "i.ytimg.com/sb/%s/",
2104 "i1.ytimg.com/sb/%s/",
2105 "i2.ytimg.com/sb/%s/",
2106 "i3.ytimg.com/sb/%s/",
2107 "i4.ytimg.com/sb/%s/",
2108 "i5.ytimg.com/sb/%s/",
2109 "i6.ytimg.com/sb/%s/",
2110 "i7.ytimg.com/sb/%s/",
2111 "i8.ytimg.com/sb/%s/",
2112 "i9.ytimg.com/sb/%s/",
2113 NULL
2114 };
2115
2116 for (u = otherURLs; *u != NULL; u++)
2117 {
2118 sprintf( tmpURL, *u, id );
2119 addDomain( admin, tmpURL, UFDBurl );
2120 }
2121 }
2122
2123
2124 UFDB_GCC_HOT
addDomain(UFDBthreadAdmin * admin,char * domain,UFDBurlType type)2125 static void addDomain(
2126 UFDBthreadAdmin * admin,
2127 char * domain,
2128 UFDBurlType type )
2129 {
2130 char * t;
2131 UFDBrevURL * revUrl;
2132 int rv;
2133 int portnumber;
2134 char protocol[16];
2135 char strippedURL[UFDB_MAX_URL_LENGTH];
2136 char strippedDomain[1024];
2137
2138 /* strip starting and trailing whitespace */
2139 while (*domain == ' ' || *domain == '\t')
2140 domain++;
2141 for (t = domain; *t != '\0' && *t != '\n'; t++)
2142 ;
2143 t--;
2144 while (t > domain && (*t == ' ' || *t == '\t'))
2145 {
2146 *t = '\0';
2147 t--;
2148 }
2149 /* skip empty lines */
2150 if (*domain == '\0')
2151 return;
2152
2153 if (ufdbGV.debug > 1)
2154 fprintf( stderr, "addDomain( %s )\n", domain );
2155
2156 numEntries++;
2157
2158 UFDBstripURL2( (char *) domain, stripWWW, strippedURL, strippedDomain, protocol, &portnumber );
2159
2160 /* we do not check illegal domain names, but must check for '..' and dot at the start, because that
2161 * generates zero-length tags and may cause a lot of evil.
2162 */
2163 if (*strippedDomain == '.')
2164 {
2165 ufdbLogError( "domain starts with '.'; bad URL is not added: %s", domain );
2166 return;
2167 }
2168 for (t = strippedDomain; *t != '\0'; t++)
2169 {
2170 if (*t == '.' && *(t+1) == '.')
2171 {
2172 ufdbLogError( "found '..' in domain; bad URL is not added: %s", domain );
2173 return;
2174 }
2175 }
2176
2177 if (!utf8support)
2178 {
2179 unsigned char * s;
2180
2181 for (s = (unsigned char *) strippedURL; *s != '\0'; s++)
2182 {
2183 if (*s >= 0x80)
2184 {
2185 if (doWarnings)
2186 fprintf( stderr, "warning: must use format 2.2 or later for URL with UTF8 characters.\n"
2187 " skipping URL %s\n", domain );
2188 return;
2189 }
2190 }
2191 }
2192
2193 #if 0
2194 if (ufdbGV.debug)
2195 ufdbLogMessage( "domain: %s\nstrippedurl: %s\nprotocol: %s\nport: %d",
2196 domain, strippedURL, protocol, portnumber );
2197 #endif
2198
2199 revUrl = UFDBgenRevURL4table( admin, (unsigned char *) strippedURL );
2200
2201 /* first do a lookup of the domain, and if it already matches, it should not be added ! */
2202 if (skipOptimisations)
2203 rv = 0;
2204 else
2205 rv = UFDBgentableLookupRevUrl( table, revUrl );
2206 if (rv)
2207 {
2208 if (doWarnings)
2209 ufdbLogMessage( "URL %s is not added because it was already matched by a previous URL", domain );
2210 }
2211 else
2212 {
2213 rv = UFDBinsertURL( table, revUrl, domain, type );
2214 if (rv)
2215 {
2216 if (doWarnings)
2217 ufdbLogMessage( "URL %s has optimised subdomains or paths", domain );
2218 }
2219 }
2220
2221 UFDBfreeRevURL( admin, revUrl );
2222
2223 if (strncmp( strippedURL, "youtube.com/watch?v=", 20 ) == 0)
2224 {
2225 char * id;
2226 char * end;
2227 id = strippedURL + 20;
2228 if (*id != '\0' && strlen(id) > 10)
2229 {
2230 /* remove other parameters after the video ID */
2231 end = id;
2232 while (*end != '\0' && *end != '&')
2233 end++;
2234 *end = '\0';
2235 addOtherYoutubeURLs( admin, id );
2236 }
2237 }
2238 }
2239
2240
/* Fill encryptKey with 16 characters produced by randomChar() and terminate it
 * with a NUL byte; encryptKey must have room for 17 bytes.
 * The random generator is seeded with a mix of the process id and the current time.
 */
static void generateRandomKey( char * encryptKey )
{
   int k;

   srandom( (getpid() << 12) + time(NULL) );

   for (k = 0;  k < 16;  k++)
      encryptKey[k] = randomChar();
   encryptKey[16] = '\0';
}
2263
2264
/* Convert the 16-character encryptKey into its printable form "XXXX-XXXX-XXXX-XXXX".
 *
 * key          output buffer of at least 20 bytes (16 key characters, 3 dashes, NUL)
 * encryptKey   the first 16 characters are copied; a terminator is not required
 */
static void copyKey( char * key, char * encryptKey )
{
   char * out = key;
   int    group;
   int    k;

   for (group = 0;  group < 4;  group++)
   {
      if (group > 0)
         *out++ = '-';
      for (k = 0;  k < 4;  k++)
         *out++ = encryptKey[4 * group + k];
   }
   *out = '\0';
}
2288
2289
encryptMemory(unsigned char * to,unsigned char * from,long n,unsigned char * key,char * format)2290 static inline void encryptMemory( unsigned char * to, unsigned char * from, long n,
2291 unsigned char * key, char * format )
2292 {
2293 ufdbCrypt uc;
2294
2295 #if UFDB_DO_DEBUG
2296 if (ufdbGV.debug)
2297 fprintf( stderr, "encryptMemory( %p %p %'ld %16.16s )\n",
2298 (void*) to, (void*) from, n, key );
2299 #endif
2300
2301 ufdbCryptInit( &uc, key, 16, format );
2302 ufdbEncryptText( &uc, to, from, n );
2303 }
2304
2305
2306 #if UFDB_BZ2LIB_SUPPORT
BZ2compressMemory(unsigned char * to,unsigned char * from,long size)2307 static inline long BZ2compressMemory( unsigned char * to, unsigned char * from, long size )
2308 {
2309 unsigned int new_size;
2310
2311 new_size = (unsigned int) (size + 2048);
2312 if (BZ_OK != BZ2_bzBuffToBuffCompress( (char *) to, &new_size, (char *) from, size, 7, 0, 30 ))
2313 {
2314 fprintf( stderr, "compression failed.\n" );
2315 exit( 1 );
2316 }
2317
2318 if (ufdbGV.debug)
2319 fprintf( stderr, "BZIP2 compression: from size %ld to %d\n", size, new_size );
2320
2321 return new_size;
2322 }
2323 #endif
2324
2325
ZLIBcompressMemory(unsigned char * to,unsigned char * from,long size)2326 inline static long ZLIBcompressMemory( unsigned char * to, unsigned char * from, long size )
2327 {
2328 int retval;
2329 z_stream zs;
2330
2331 zs.zalloc = Z_NULL;
2332 zs.zfree = Z_NULL;
2333 zs.opaque = Z_NULL;
2334 zs.zalloc = ufdbZlibMalloc;
2335 zs.zfree = ufdbZlibFree;
2336
2337 zs.next_in = from;
2338 zs.avail_in = size;
2339
2340 zs.next_out = to;
2341 zs.avail_out = size + 2048;
2342
2343
2344 retval = deflateInit( &zs, Z_BEST_COMPRESSION );
2345 if (Z_OK != retval)
2346 {
2347 fprintf( stderr, "ZLIB initialisation failed: error %d *****\n", retval );
2348 exit( 1 );
2349 }
2350 retval = deflate( &zs, Z_FINISH );
2351 if ((Z_STREAM_END != retval && Z_OK != retval) || zs.avail_out == 0)
2352 {
2353 fprintf( stderr, "ZLIB compression failed: error %d avail_out %u *****\n", retval, zs.avail_out );
2354 exit( 1 );
2355 }
2356
2357 if (ufdbGV.debug)
2358 fprintf( stderr, "ZLIB compression: from size %ld to %lu\n", size, zs.total_out );
2359
2360 size = (long) zs.total_out;
2361 deflateEnd( &zs );
2362 return size;
2363 }
2364
2365
doCryptCompress(FILE * f,char * encryptKey,char * format)2366 static void doCryptCompress(
2367 FILE * f,
2368 char * encryptKey,
2369 char * format )
2370 {
2371 long hdr_size;
2372 long size;
2373 long orig_size;
2374 unsigned char * buffer2;
2375 #if 0
2376 int cksum = 0;
2377 #endif
2378
2379 /* The table is in mem[]; doCrypt and/or doZLIBcompress or doBZ2compress is 1.
2380 * The result of this function is in mem[] and the size (mem_i) is adjusted.
2381 */
2382
2383 orig_size = size = mem_i;
2384
2385 if (ufdbGV.debug)
2386 fprintf( stderr, "doCryptCompress orig_size %'ld bytes doCrypt=%d doZLIBcompress=%d doBZ2compress=%d "
2387 "format=%s\n",
2388 orig_size, doCrypt, doZLIBcompress, doBZ2compress, format );
2389
2390 buffer2 = (unsigned char *) malloc( size + 2048 );
2391 if (buffer2 == NULL)
2392 {
2393 fprintf( stderr, "cannot allocate memory for encryption and/or compression (size=%'ld)\n", size );
2394 exit( 1 );
2395 }
2396
2397 /* make sure the 'result' is in buffer2 */
2398 #if UFDB_BZ2LIB_SUPPORT
2399 if (doBZ2compress)
2400 {
2401 size = BZ2compressMemory( buffer2, mem, orig_size );
2402 if (ufdbGV.debug)
2403 fprintf( stderr, "BZIP2 compressed %'ld bytes to %'ld bytes in buffer2\n", orig_size, size );
2404 }
2405 else
2406 #endif
2407 if (doZLIBcompress)
2408 {
2409 size = ZLIBcompressMemory( buffer2, mem, orig_size );
2410 if (ufdbGV.debug)
2411 fprintf( stderr, "ZLIB compressed %'ld bytes to %'ld bytes in buffer2\n", orig_size, size );
2412 }
2413 else
2414 {
2415 memcpy( buffer2, mem, size );
2416 if (ufdbGV.debug)
2417 fprintf( stderr, "copied %'ld bytes to buffer2 (no compression)\n", size );
2418 }
2419
2420 /* crypt from buffer2 into mem */
2421 if (doCrypt)
2422 {
2423 encryptMemory( mem, buffer2, size, (unsigned char *) encryptKey, format );
2424 if (ufdbGV.debug)
2425 fprintf( stderr, "crypted %'ld bytes from buffer2 to mem\n", size );
2426 }
2427 else
2428 {
2429 memcpy( mem, buffer2, size );
2430 if (ufdbGV.debug)
2431 fprintf( stderr, "copied %'ld bytes from buffer2 to mem\n", size );
2432 }
2433
2434 #if 0
2435 /* TODO fix the problem with 2.1 cksum */
2436 if (strcmp( format, "2.1" ) >= 0)
2437 {
2438 cksum = UFDBcalcCksum( buffer1, size );
2439 }
2440 #endif
2441
2442 hdr_size = (doPadding || format[0] >= '3') ?
2443 sizeof(struct UFDBfileHeader21) : sizeof(struct UFDBfileHeader);
2444 /* write mem to the file */
2445 fseek( f, (long) hdr_size, SEEK_SET );
2446
2447 if (1 != fwrite( mem, size, 1, f ))
2448 {
2449 fprintf( stderr, "cannot write crypted/compressed table to file: fwrite failed.\n" );
2450 exit( 3 );
2451 }
2452 fflush( f );
2453
2454 if (ufdbGV.debug)
2455 fprintf( stderr, "%'ld bytes written to file\n", size );
2456
2457 /* truncate the file (if we did compression) */
2458 if ((doZLIBcompress || doBZ2compress) && size < orig_size)
2459 {
2460 if (ftruncate( fileno(f), size + hdr_size ) < 0)
2461 fprintf( stderr, "failed to truncate compressed file to size %'ld", (long) size + hdr_size );
2462 }
2463
2464 free( buffer2 );
2465 }
2466
2467
2468 #if 0
2469 static void UFDB_GCC_HOT convertSpecialCharacters( unsigned char * domain )
2470 {
2471 unsigned char * s;
2472 unsigned char * d;
2473
2474 for (s = domain, d = domain; *s != '\0'; s++)
2475 {
2476 if (*s == '%')
2477 {
2478 unsigned int hex;
2479 unsigned int h1, h2;
2480
2481 h1 = *(s+1);
2482 h2 = *(s+2);
2483 if (isxdigit(h1) && isxdigit(h2))
2484 {
2485 hex = (h1 <= '9') ? h1 - '0' : h1 - 'a' + 10;
2486 hex *= 16;
2487 hex += (h2 <= '9') ? h2 - '0' : h2 - 'a' + 10;
2488 if (hex == 0)
2489 {
2490 s += 2;
2491 continue;
2492 }
2493 else if (hex <= 0x20)
2494 {
2495 if (hex != '\t' && hex != '\r' && hex != '\n' && hex != '\f')
2496 hex = ' ';
2497 }
2498 else
2499 {
2500 if (hex == 0x7f || hex == 0xff)
2501 hex = ' ';
2502 else
2503 if (hex <= 'Z' && hex >= 'A')
2504 hex += 'a' - 'A';
2505 }
2506 *d++ = hex;
2507 s += 2;
2508 }
2509 else
2510 *d++ = *s;
2511 }
2512 else
2513 {
2514 *d++ = *s;
2515 }
2516 }
2517 *d = '\0';
2518 }
2519 #endif
2520
2521
/* Return the length of the string p where each %xx sequence ('%' followed by two
 * hexadecimal digits) is counted as one character.
 */
static int calcpathlen( char * p )
{
   int len = 0;

   while (*p != '\0')
   {
      /* isxdigit() is only defined for values that fit in an unsigned char (and EOF),
       * so the characters are cast: URLs may contain bytes >= 0x80.
       * p[2] is only inspected if p[1] is a hex digit, i.e. not the terminator.
       */
      if (p[0] == '%'  &&  isxdigit( (unsigned char) p[1] )  &&  isxdigit( (unsigned char) p[2] ))
         p += 2;
      p++;
      len++;
   }

   return len;
}
2534
2535
2536 #define MAXPARAMS 200
2537
/* Comparison function for qsort() on an array of strings with URL parameters.
 *
 * a and b point to the string pointers.  The strings are compared byte by byte as
 * unsigned characters where '=' counts as the end of the string, so effectively only
 * the parameter names are compared.
 * Returns the difference of the first pair of characters that differ, or where one of
 * them is an end ('\0' or '=').
 *
 * NOTE(review): when the end of the first string is reached, the character of the
 * second string is used as is, i.e. a '=' is not treated as an end there.  So "ab"
 * against "ab=1" yields a negative value while "ab=1" against "ab" yields 0.
 */
static int sortstrcmp( const void * a, const void * b )
{
   const unsigned char * left;
   const unsigned char * right;
   int                   cl, cr;

   left  = *(unsigned char * const *) a;
   right = *(unsigned char * const *) b;

   for ( ;  *left != '\0';  left++, right++)
   {
      cl = (*left  == '=') ? 0 : (int) *left;
      cr = (*right == '=') ? 0 : (int) *right;

      if (cl == 0  ||  cr == 0  ||  cl != cr)
         return cl - cr;
   }

   return (int) *left - (int) *right;
}
2569
2570
sortURLparams(unsigned char * parlist)2571 static void sortURLparams( unsigned char * parlist )
2572 {
2573 char * p;
2574 char * sep;
2575 char * pp[MAXPARAMS];
2576 unsigned int i;
2577 unsigned int n;
2578
2579 if (strchr( (char*) parlist, '&' ) == NULL)
2580 return;
2581
2582 p = (char*) parlist;
2583 n = 0;
2584 while ((sep = strchr( p, '&' )) != NULL)
2585 {
2586 *sep = '\0';
2587 pp[n] = ufdbStrdup( p );
2588 p = sep + 1;
2589 n++;
2590 if (n == MAXPARAMS-1)
2591 {
2592 ufdbLogError( "cannot sort parameters of %s", parlist );
2593 for (i = 0; i < n; i++)
2594 ufdbFree( pp[i] );
2595 return;
2596 }
2597 }
2598 if (n == 0)
2599 return;
2600 /* assign the last parameter to pp */
2601 pp[n] = ufdbStrdup( p );
2602
2603 ++n;
2604
2605 qsort( pp, (size_t) n, (size_t) sizeof(char*), sortstrcmp );
2606
2607 p = (char*) parlist;
2608 for (i = 0; i < n; i++)
2609 {
2610 p += sprintf( p, "%s", pp[i] );
2611 if (i < n-1)
2612 *p++ = '&';
2613 ufdbFree( pp[i] );
2614 }
2615 }
2616
2617
2618 UFDB_GCC_HOT
main(int argc,char * argv[])2619 int main( int argc, char * argv[] )
2620 {
2621 int n;
2622 int opt;
2623 time_t now;
2624 struct tm tm;
2625 unsigned char * d;
2626 char encryptKey[16+1];
2627 char key[16+3+1];
2628 char flags[8+1];
2629 FILE * fout;
2630 char * fout_buffer;
2631 UFDBthreadAdmin * admin;
2632 int hdr_size;
2633 struct UFDBfileHeader21 header;
2634 char date[64];
2635 char outFileName[512];
2636 char tempOutFileName[512];
2637 unsigned char domain[4096];
2638
2639 #if UFDB_DPDK_SUPPORT
2640 int retval;
2641 char * dummy[] = { "ufdbGenTable", NULL };
2642 retval = rte_eal_init( 0, dummy );
2643 if (retval < 0)
2644 rte_panic( "cannot initialize RTE EAL\n" );
2645 #endif
2646
2647 UFDBinitializeGV( &ufdbGV );
2648 #if UFDBSS_SQUID
2649 strcpy( ufdbGV.progname, "ufdbGenTable" );
2650 #elif UFDBSS_RESTAPI
2651 strcpy( ufdbGV.progname, "ufdbRESTGenTable" );
2652 #elif UFDBSS_QS
2653 strcpy( ufdbGV.progname, "ufdbqsGenTable" );
2654 #else
2655 strcpy( ufdbGV.progname, "ufdbGenTable" );
2656 #endif
2657 UFDBappInit();
2658 admin = UFDBallocThreadAdmin();
2659 domainsFileName = NULL;
2660 urlsFileName = NULL;
2661 tableName = "defaulttable";
2662 date[0] = flags[0] = key[0] = encryptKey[0] = '\0';
2663
2664 while ((opt = getopt( argc, argv, "BLDF:k:t:d:u:UNnCqpPsSVW?XZ" )) > 0)
2665 {
2666 switch (opt)
2667 {
2668 case 'B':
2669 case 'L':
2670 fprintf( stderr, "-B and -L options are ignored. The generated table has an L format\n" );
2671 // endian = opt;
2672 break;
2673 case 'D':
2674 ufdbGV.debug++;
2675 break;
2676 case 'F':
2677 format = optarg;
2678 if (!strmatch3( format, "2.0" ) &&
2679 !strmatch3( format, "2.1" ) &&
2680 !strmatch3( format, "2.2" )
2681 #if UFDB_DBFORMAT_3
2682 && !strmatch3( format, "3.0" )
2683 #endif
2684 )
2685 {
2686 #if UFDB_DBFORMAT_3
2687 fprintf( stderr, "-F option only accepts 2.0, 2.1, 2.2 and 3.0 as file format specifiers\n" );
2688 #else
2689 fprintf( stderr, "-F option only accepts 2.0, 2.1 and 2.2 as file format specifiers\n" );
2690 #endif
2691 usage();
2692 }
2693 if (strmatch3( format, "2.0" ) ||
2694 strmatch3( format, "2.1" ))
2695 {
2696 utf8support = 0;
2697 }
2698 break;
2699 case 't':
2700 tableName = optarg;
2701 break;
2702 case 'd':
2703 domainsFileName = optarg;
2704 break;
2705 case 's':
2706 doSanityCheck = 1;
2707 break;
2708 case 'S':
2709 printStatistics = 1;
2710 break;
2711 case 'u':
2712 urlsFileName = optarg;
2713 break;
2714 case 'U':
2715 urlsIncluded = 1;
2716 break;
2717 case 'k':
2718 strncpy( encryptKey, optarg, 16 );
2719 encryptKey[16] = '\0';
2720 if (strlen( encryptKey ) != 16)
2721 {
2722 fprintf( stderr, "key \"%s\" is not valid.\n", encryptKey );
2723 usage();
2724 }
2725 break;
2726 case 'n':
2727 doCrypt = 0;
2728 break;
2729 case 'N':
2730 utf8support = 0;
2731 break;
2732 case 'p':
2733 doPadding = 1;
2734 break;
2735 case 'P':
2736 doProd = 1;
2737 break;
2738 case 'C':
2739 #if UFDB_BZ2LIB_SUPPORT
2740 doBZ2compress = 1;
2741 #else
2742 fprintf( stderr, "bzip2 compression is not supported on this platform.\n" );
2743 exit( 4 );
2744 #endif
2745 break;
2746 case 'q':
2747 doWarnings = 0;
2748 break;
2749 case 'V':
2750 printf( "%s version " UFDB_VERSION "\n", ufdbGV.progname );
2751 printf( "Copyright (C) 2005-2020 by URLfilterDB B.V.\n" );
2752 exit( 0 );
2753 case 'W':
2754 stripWWW = 1;
2755 break;
2756 case 'X':
2757 skipOptimisations = 1;
2758 break;
2759 case 'Z':
2760 doZLIBcompress = 1;
2761 break;
2762 case '?':
2763 fprintf( stderr, "help:\n" );
2764 usage();
2765 break;
2766 default:
2767 fprintf( stderr, "internal error: getopt returned \"%c\"\n", opt );
2768 usage();
2769 break;
2770 }
2771 }
2772
2773 if (strlen(tableName) > 15)
2774 {
2775 tableName[15] = '\0';
2776 fprintf( stderr, "warning: the tableName is truncated to \"%s\"\n", tableName );
2777 }
2778
2779 if (domainsFileName == NULL)
2780 {
2781 fprintf( stderr, "the input file name is not specified: use the -d option\n" );
2782 usage();
2783 }
2784
2785 if (doZLIBcompress && doBZ2compress)
2786 {
2787 fprintf( stderr, "use -C or -Z but not both.\n" );
2788 usage();
2789 }
2790
2791 fin = fopen( domainsFileName, "r" );
2792 if (fin == NULL)
2793 {
2794 char strbuf[128];
2795 strerror_r( errno, strbuf, sizeof(strbuf) );
2796 fprintf( stderr, "cannot read from \"%s\": %s\n", domainsFileName, strbuf );
2797 usage();
2798 }
2799 if (ufdbGV.debug)
2800 fprintf( stderr, "processing domains from file \"%s\"\n", domainsFileName );
2801
2802 strcpy( outFileName, domainsFileName );
2803 strcat( outFileName, UFDBfileSuffix );
2804
2805 strcpy( tempOutFileName, outFileName );
2806 strcat( tempOutFileName, ".temp" );
2807
2808 fout = fopen( tempOutFileName, "w+" );
2809 if (fout == NULL)
2810 {
2811 char strbuf[128];
2812 strerror_r( errno, strbuf, sizeof(strbuf) );
2813 fprintf( stderr, "cannot write to \"%s\": %s\n", tempOutFileName, strbuf );
2814 usage();
2815 }
2816 if (ufdbGV.debug)
2817 fprintf( stderr, "opened temporary file \"%s\"\n", tempOutFileName );
2818 fout_buffer = (char *) malloc( 64*1024 );
2819 setvbuf( fout, fout_buffer, _IOFBF, 64*1024 );
2820
2821 /* setlinebuf( stderr ); */
2822 initTable( tableName );
2823
2824 URLparamSupport = 0;
2825 if ((format[0] == '2' && format[2] >= '2') || format[0] >= '3')
2826 URLparamSupport = 1;
2827
2828
2829 /* process the domains ********************************************/
2830 n = 0;
2831 readdomains:
2832 while (!feof(fin))
2833 {
2834 int last_char;
2835 unsigned char * ptr;
2836
2837 ptr = domain;
2838
2839 while ((*ptr = last_char = myfast_getc(fin)) != '\n')
2840 {
2841 /* check for a last line without \n */
2842 if (last_char == EOF)
2843 {
2844 if (ptr != domain)
2845 break;
2846 goto eof;
2847 }
2848 if (last_char == '\r')
2849 continue;
2850 ptr++;
2851 if (ptr > &domain[4090])
2852 {
2853 *ptr = '\0';
2854 fprintf( stderr, "line too long: %s\n", domain );
2855 while (!feof(fin) && myfast_getc(fin) != '\n')
2856 ;
2857 goto readdomains;
2858 }
2859 }
2860 *ptr = '\0';
2861
2862 d = domain;
2863 while (*d == ' ')
2864 d++;
2865
2866 if (d[0] != '\0' && d[0] != '#')
2867 {
2868 char * first_slash;
2869
2870 d = skipProtocol( d );
2871 if (d == NULL)
2872 d = domain;
2873
2874 /* TODO: use has_tld(domain+4) */
2875 if (d[0] == 'w' && d[1] == 'w' && d[2] == 'w' && d[3] == '.' && strchr( (char*) d+4, '.' ) != NULL)
2876 {
2877 if (stripWWW)
2878 {
2879 if (doWarnings)
2880 fprintf( stderr, "notice: \"www.\" is stripped for %s\n", d );
2881 }
2882 else if (doWarnings)
2883 {
2884 fprintf( stderr, "warning: domain name starts with \"www.\": %s (use -W option ?)\n", d );
2885 }
2886 }
2887
2888 if (doWarnings)
2889 {
2890 if (ptr - d > 66)
2891 fprintf( stderr, "warning: long domain name: %s\n", d );
2892 }
2893
2894 if (!urlsIncluded)
2895 {
2896 first_slash = strchr( (char *) d, '/' );
2897 if (first_slash != NULL)
2898 fprintf( stderr, "warning: domain name (%s) has a '/'\n", d );
2899 }
2900
2901 if (UFDBsanityCheckDomainname( (char *) d ))
2902 addDomain( admin, (char *) d, UFDBdomain );
2903 }
2904 }
2905 eof:
2906 fclose( fin );
2907
2908 /* process the urls ***********************************************/
2909 if (urlsFileName != NULL)
2910 {
2911 fin = fopen( urlsFileName, "r" );
2912 if (fin == NULL)
2913 {
2914 char strbuf[128];
2915 strerror_r( errno, strbuf, sizeof(strbuf) );
2916 fprintf( stderr, "cannot read from \"%s\": %s\n", urlsFileName, strbuf );
2917 fclose( fout );
2918 unlink( tempOutFileName );
2919 usage();
2920 }
2921 if (ufdbGV.debug)
2922 fprintf( stderr, "processing urls from file \"%s\"\n", urlsFileName );
2923
2924 readurls:
2925 while (!feof(fin))
2926 {
2927 int last_char;
2928 unsigned char * qm;
2929 unsigned char * ptr;
2930 unsigned char * first_slash;
2931
2932 qm = NULL;
2933 ptr = domain;
2934
2935 while ((*ptr = last_char = myfast_getc(fin)) != '\n')
2936 {
2937 /* check for a last line without \n */
2938 if (last_char == EOF)
2939 {
2940 if (ptr != domain)
2941 break;
2942 goto eof2;
2943 }
2944 if (last_char == '\r') /* Skip '\r' */
2945 continue;
2946 if (qm == NULL && last_char == '?') /* remember the first question mark */
2947 qm = ptr;
2948 if (last_char < ' ') /* illegal control character in URL */
2949 {
2950 if (doWarnings)
2951 fprintf( stderr, "illegal control character in URL: %s\n", domain );
2952 *ptr = '\0';
2953 while (!feof(fin) && myfast_getc(fin) != '\n')
2954 ;
2955 break;
2956 }
2957 ptr++;
2958 if (ptr > &domain[4090])
2959 {
2960 *ptr = '\0';
2961 fprintf( stderr, "URL too long: %s\n", domain );
2962 while (!feof(fin) && myfast_getc(fin) != '\n')
2963 ;
2964 goto readurls;
2965 }
2966 }
2967 *ptr = '\0';
2968
2969 d = domain;
2970 while (*d == ' ')
2971 d++;
2972 d = skipProtocol( d );
2973 if (d == NULL)
2974 d = domain;
2975
2976 if (d[0] != '\0' && d[0] != '#')
2977 {
2978 first_slash = (unsigned char *) strchr( (char *) d, '/' );
2979 if (first_slash == NULL)
2980 {
2981 if (doWarnings)
2982 {
2983 fprintf( stderr, "warning: URL has no '/': %s\n", d );
2984 if (strlen( (char *) d ) > 66)
2985 fprintf( stderr, "warning: long domainname in URL: %s\n", d );
2986 }
2987 }
2988 else
2989 {
2990 int pathlen;
2991
2992 if (qm != NULL)
2993 {
2994 /* make sure that the database format is 2.2+ when parameters are used */
2995 if (!URLparamSupport)
2996 {
2997 if (doWarnings)
2998 fprintf( stderr, "warning: URL with parameters is only supported in format 2.2 and newer. URL: %s\n", d );
2999 numWarnings++;
3000 *qm = '\0';
3001 qm = NULL;
3002 }
3003 else
3004 {
3005 if (*(qm+1) == '\0')
3006 {
3007 if (doWarnings)
3008 fprintf( stderr, "warning: URL with '?' has no parameters and '?' is removed from URL %s\n", d );
3009 *qm = '\0';
3010 qm = NULL;
3011 }
3012 }
3013 }
3014
3015 pathlen = calcpathlen( (char *) first_slash );
3016 if (qm != NULL)
3017 {
3018 unsigned int params_len;
3019
3020 /* TO-DO: the following check on the length of all parameters is a bit limiting
3021 * TO-DO: instead must check length of each parameter and each value.
3022 */
3023 params_len = calcpathlen( (char *) qm );
3024 pathlen -= params_len;
3025 if (params_len >= sizeof(UFDBurlPart))
3026 {
3027 if (doWarnings)
3028 fprintf( stderr, "warning: parameter list exceeds %d characters and may be too long: %s\n",
3029 (int) sizeof(UFDBurlPart)-1, d );
3030 }
3031 sortURLparams( qm+1 );
3032 }
3033 if (doWarnings)
3034 {
3035 if (first_slash - d > 66)
3036 fprintf( stderr, "warning: long domainname in URL: %s\n", d );
3037 if (pathlen > 127)
3038 fprintf( stderr, "warning: long path in URL: %s\n", d );
3039 else if (pathlen >= (int) sizeof(UFDBurlPart))
3040 fprintf( stderr, "warning: very long URL: %s\n", d );
3041 }
3042 }
3043
3044 /* does URL start with "www." ? */
3045 if (d[0] == 'w' && d[1] == 'w' && d[2] == 'w' && d[3] == '.' && strchr( (char*) d+4, '.' ) != NULL)
3046 {
3047 if (stripWWW)
3048 {
3049 if (doWarnings)
3050 fprintf( stderr, "notice: \"www.\" is stripped for %s\n", d );
3051 }
3052 else if (doWarnings)
3053 {
3054 fprintf( stderr, "warning: URL name starts with \"www.\": %s (use -W option ?)\n", d );
3055 }
3056 }
3057
3058 if (UFDBsanityCheckDomainname( (char *) d ))
3059 addDomain( admin, (char *) d, UFDBurl );
3060 }
3061 }
3062 eof2:
3063 fclose( fin );
3064 }
3065
3066 if (encryptKey[0] == '\0')
3067 generateRandomKey( encryptKey );
3068 copyKey( key, encryptKey );
3069
3070 /* write the table in binary format to the memory buffer */
3071 init_membuf();
3072 if (strmatch3( format, "2.0" ))
3073 {
3074 DEBUG(( stderr, "writing table in DB 2.0 format\n" ));
3075 createMemTable_2_0( table );
3076 }
3077 else if (strmatch3( format, "2.1" ))
3078 {
3079 DEBUG(( stderr, "writing table in DB 2.1 format\n" ));
3080 createMemTable_2_1( table );
3081 }
3082 else if (strmatch3( format, "2.2" ))
3083 {
3084 DEBUG(( stderr, "writing table in DB 2.2 format\n" ));
3085 createMemTable_2_2( table );
3086 }
3087 #if UFDB_DBFORMAT_3
3088 else if (strmatch3( format, "3.0" ))
3089 {
3090 DEBUG(( stderr, "writing table in DB 3.0 format\n" ));
3091 createMemTable_3_0( table, 0, UFDBunknownType, 0 );
3092 }
3093 #endif
3094 mem_putc( UFDBendTable );
3095
3096 /* when SSE and AVX/AVX2 instructions are used we need a 32-byte safeguard */
3097 for (n = 0; n < 32; n++)
3098 mem_putc( UFDBendTable );
3099
3100 #if 0
3101 /* various performance tests have showed that using strcmp() or SSE4.2 is slower than byte-by-byte
3102 * string comparison so strcmp() is not used in the performance-critical strcmpURLpart().
3103 */
3104 mem_puts( (unsigned char *) "64 padding characters since strcmp might use 256-bit lookahead..." );
3105 #endif
3106
3107 if (format[0] < '3')
3108 numIndexNodes = numNodes - numLeafNodes;
3109
3110 /* write the table header to the output file */
3111 strcpy( flags, "--------" );
3112 if (doBZ2compress)
3113 flags[0] = 'C';
3114 if (doZLIBcompress)
3115 flags[0] = 'Z';
3116 if (doProd)
3117 flags[1] = 'P';
3118 if (doCrypt)
3119 flags[2] = 'Q';
3120 if (doPadding)
3121 flags[3] = 'p';
3122 flags[4] = endian;
3123 now = time( NULL );
3124 gmtime_r( &now, &tm );
3125 sprintf( date, "%4d%02d%02d.%02d%02d",
3126 tm.tm_year+1900, tm.tm_mon+1, tm.tm_mday, tm.tm_hour, tm.tm_min );
3127 sprintf( header.string, "%s %s %s %ld key=%s date=%s %8s %ld %d\n\n",
3128 "UFDB", format, tableName, mem_i, key, date, flags, numNodes, 0 );
3129 fprintf( fout, "%s", header.string );
3130 hdr_size = (doPadding || format[0] >= '3') ?
3131 sizeof(struct UFDBfileHeader21) : sizeof (struct UFDBfileHeader);
3132 for (n = hdr_size - strlen(header.string); n > 0; n--)
3133 myfast_putc( '\0', fout );
3134
3135 if (printStatistics || ufdbGV.debug)
3136 {
3137 fprintf( stderr, "table header: %s", header.string );
3138 fprintf( stderr, "#nodes: %9ld #leafs: %9ld #index: %9ld\n",
3139 numNodes, numLeafNodes, numIndexNodes );
3140 #if UFDB_DBFORMAT_3
3141 if (format[0] == '3')
3142 {
3143 fprintf( stderr, "#labels: %9ld #chunks: %9ld #ovrflw: %9ld #fewLab: %9ld\n",
3144 numLabelNodes, numChnksStat, numOverflow, numFewLabels );
3145 fprintf( stderr, "#path8: %9ld #path16: %9ld #pth255: %9ld #manyPath: %9ld\n",
3146 numVeryFewPaths, numFewPaths, num250Paths, numManyPaths );
3147 }
3148 #endif
3149 fprintf( stderr, "#sub1: %9ld #sub2: %9ld #sub3: %9ld #sub4: %9ld\n",
3150 numSub1, numSub2, numSub3, numSub4 );
3151 fprintf( stderr, "#sub5: %9ld #sub6: %9ld #sub7: %9ld #sub8: %9ld\n",
3152 numSub5, numSub6, numSub7, numSub8 );
3153 fprintf( stderr, "#sub8+: %9ld #sub255+:%9ld #sub64K+:%9ld\n",
3154 numSub8plus, numSub255plus, numSub64Kplus );
3155 }
3156
3157 /* encrypt and compress the table: rewind, read, compress, crypt and write */
3158 if (doCrypt || doBZ2compress || doZLIBcompress)
3159 {
3160 doCryptCompress( fout, encryptKey, format );
3161 }
3162 else
3163 {
3164 fwrite( mem, mem_i, 1, fout );
3165 }
3166
3167 fflush( fout );
3168 fdatasync( fileno(fout) );
3169 fclose( fout );
3170 free( fout_buffer );
3171
3172 /* to get around some permission problems: unlink before rename */
3173 if (unlink( outFileName ) < 0 && errno != ENOENT)
3174 {
3175 char strbuf[128];
3176 strerror_r( errno, strbuf, sizeof(strbuf) );
3177 fprintf( stderr, "cannot remove \"%s\": %s\n", outFileName, strbuf );
3178 }
3179 if (rename( tempOutFileName, outFileName ) != 0)
3180 {
3181 char strbuf[128];
3182 strerror_r( errno, strbuf, sizeof(strbuf) );
3183 fprintf( stderr, "cannot rename '%s' into '%s': %s\n", tempOutFileName, outFileName, strbuf );
3184 (void) unlink( tempOutFileName );
3185 exit( 1 );
3186 }
3187 else if (ufdbGV.debug)
3188 fprintf( stderr, "temporary file \"%s\" renamed to \"%s\"\n", tempOutFileName, outFileName );
3189
3190 if (numWarnings)
3191 fprintf( stderr, "%d warning(s) ***\n", numWarnings );
3192
3193 return 0;
3194 }
3195
3196
/* Since ufdbguard (single-threaded) and ufdbguardd (multi-threaded)
 * share source code, we put some pthread dummies here because we don't need/want pthreads.
 */
3200
3201 #if GCC
3202 #pragma GCC diagnostic ignored "-Wunused-parameter"
3203 #endif
3204
/*
 * Dummy replacement for the mutex lock operation: this program does not
 * use pthreads, so the mutex argument is ignored and nothing is locked.
 * Always returns 0.
 */
int ufdb_mutex_lock(
   ufdb_mutex * m __attribute__((unused)) )
{
   return 0;
}
3207
/*
 * Dummy replacement for the mutex try-lock operation: the mutex argument
 * is ignored and no locking is attempted.
 * Always returns 0.
 */
int ufdb_mutex_trylock(
   ufdb_mutex * m __attribute__((unused)) )
{
   return 0;
}
3210
/*
 * Dummy replacement for the mutex unlock operation: the mutex argument
 * is ignored and nothing is released.
 * Always returns 0.
 */
int ufdb_mutex_unlock(
   ufdb_mutex * m __attribute__((unused)) )
{
   return 0;
}
3213
/*
 * Dummy replacement for mutex initialisation: the mutex argument is
 * ignored and left untouched.
 */
void ufdb_mutex_init(
   ufdb_mutex * m __attribute__((unused)) )
{
   /* intentionally empty: there is nothing to initialise */
}
3216
#if 0
/*
 * Disabled (compiled out by "#if 0"): dummy versions of the pthread
 * condition variable calls.  Both ignore their arguments and return 0.
 */
int pthread_cond_signal(
   pthread_cond_t * cond __attribute__((unused)) )
{ return 0; }

int pthread_cond_wait( pthread_cond_t *  cond  __attribute__((unused)),
                       pthread_mutex_t * mutex __attribute__((unused)) )
{ return 0; }
#endif
3230
3231
3232 #ifdef __cplusplus
3233 }
3234 #endif
3235