1 /*
2  * genTable.c - URLfilterDB
3  *
4  * ufdbGuard is copyrighted (C) 2005-2020 by URLfilterDB B.V. with all rights reserved.
5  *
6  * Parts of the ufdbGuard daemon are based on squidGuard.
7  * This module is NOT based on squidGuard.
8  *
9  * Generate a binary table file (.ufdb) from unordered ASCII files
10  * with domains and urls.
11  *
12  * usage: ufdbGenTable [-F 2.0|2.1|2.2|2.3|3.0] [-V] [-n] [-C | -Z] [-X] [-k <key>] -t <tableName> -d <domains> [-u <urls>]
13  *
14  * RCS $Id: genTable.c,v 1.164 2020/10/22 07:12:36 root Exp root $
15  */
16 
17 #define UFDB_DO_DEBUG 0
18 
19 /* ufdbGenTable needs speed! */
20 #undef _FORTIFY_SOURCE
21 
22 #if UFDB_OVERRIDE_GCC_OPT  &&  !UFDB_DO_DEBUG && ((__GNUC__ > 4)  ||  (__GNUC__ == 4  &&  __GNUC_MINOR__ >= 4))
23 #pragma GCC optimize ("O3")
24 #endif
25 
26 #if !UFDB_DO_DEBUG && defined(__OPTIMIZE__) && 0
27 #define __USE_STRING_INLINES 1
28 #endif
29 
30 
31 #include "ufdb.h"
32 #include "ufdblib.h"
33 #include "ufdbdb.h"
34 #include "ufdblocks.h"
35 #if UFDBSS_RESTAPI
36 #include "ufdbstrlib.h"
37 #endif
38 
39 #include <stdio.h>
40 #include <stdarg.h>
41 #include <stdlib.h>
42 #include <string.h>
43 #include <strings.h>
44 #include <ctype.h>
45 #include <limits.h>
46 #include <time.h>
47 #include <errno.h>
48 #include <sys/types.h>
49 #include <unistd.h>
50 
51 #if UFDB_BZ2LIB_SUPPORT
52 #include "bzlib.h"
53 #endif
54 #include "zlib.h"
55 
56 #ifdef __cplusplus
57 extern "C" {
58 #endif
59 
/* strmatchN(a,b): fixed-length string equality tests.
 * Each macro compares the first N characters of a and b one by one and then
 * requires a[N] to be the terminating NUL, so the expression is non-zero (true)
 * when a is exactly N characters long and equals the first N characters of b.
 * Note that, unlike strcmp(), a non-zero value means "match".
 * b is never checked for its terminator; it is presumably always a literal of
 * exactly N characters.  Both arguments are evaluated multiple times, so they
 * must not have side effects.
 */
#define strmatch2(a,b)    ((a)[0] == (b)[0]  &&  (a)[1] == (b)[1]  &&  (a)[2] == '\0')
#define strmatch3(a,b)    ((a)[0] == (b)[0]  &&  (a)[1] == (b)[1]  &&  (a)[2] == (b)[2]  &&  (a)[3] == '\0')
#define strmatch4(a,b)    ((a)[0] == (b)[0]  &&  (a)[1] == (b)[1]  &&  (a)[2] == (b)[2]  &&  (a)[3] == (b)[3]  &&  (a)[4] == '\0')
#define strmatch5(a,b)    ((a)[0] == (b)[0]  &&  (a)[1] == (b)[1]  &&  (a)[2] == (b)[2]  &&  (a)[3] == (b)[3]  &&  (a)[4] == (b)[4]  && (a)[5] == '\0')
#define strmatch6(a,b)    ((a)[0] == (b)[0]  &&  (a)[1] == (b)[1]  &&  (a)[2] == (b)[2]  &&  (a)[3] == (b)[3]  &&  (a)[4] == (b)[4]  &&  (a)[5] == (b)[5]  &&  (a)[6] == '\0')
#define strmatch7(a,b)    ((a)[0] == (b)[0]  &&  (a)[1] == (b)[1]  &&  (a)[2] == (b)[2]  &&  (a)[3] == (b)[3]  &&  (a)[4] == (b)[4]  &&  (a)[5] == (b)[5]  &&  (a)[6] == (b)[6]  &&  (a)[7] == '\0')
#define strmatch8(a,b)    ((a)[0] == (b)[0]  &&  (a)[1] == (b)[1]  &&  (a)[2] == (b)[2]  &&  (a)[3] == (b)[3]  &&  (a)[4] == (b)[4]  &&  (a)[5] == (b)[5]  &&  (a)[6] == (b)[6]  &&  (a)[7] == (b)[7]  &&  (a)[8] == '\0')
#define strmatch9(a,b)    ((a)[0] == (b)[0]  &&  (a)[1] == (b)[1]  &&  (a)[2] == (b)[2]  &&  (a)[3] == (b)[3]  &&  (a)[4] == (b)[4]  &&  (a)[5] == (b)[5]  &&  (a)[6] == (b)[6]  &&  (a)[7] == (b)[7]  &&  (a)[8] == (b)[8]  &&  (a)[9] == '\0')
#define strmatch10(a,b)    ((a)[0] == (b)[0]  &&  (a)[1] == (b)[1]  &&  (a)[2] == (b)[2]  &&  (a)[3] == (b)[3]  &&  (a)[4] == (b)[4]  &&  (a)[5] == (b)[5]  &&  (a)[6] == (b)[6]  &&  (a)[7] == (b)[7]  &&  (a)[8] == (b)[8]  &&  (a)[9] == (b)[9]  &&  (a)[10] == '\0')
#define strmatch11(a,b)    ((a)[0] == (b)[0]  &&  (a)[1] == (b)[1]  &&  (a)[2] == (b)[2]  &&  (a)[3] == (b)[3]  &&  (a)[4] == (b)[4]  &&  (a)[5] == (b)[5]  &&  (a)[6] == (b)[6]  &&  (a)[7] == (b)[7]  &&  (a)[8] == (b)[8]  &&  (a)[9] == (b)[9]  &&  (a)[10] == (b)[10]  &&  (a)[11] == '\0')
#define strmatch12(a,b)    ((a)[0] == (b)[0]  &&  (a)[1] == (b)[1]  &&  (a)[2] == (b)[2]  &&  (a)[3] == (b)[3]  &&  (a)[4] == (b)[4]  &&  (a)[5] == (b)[5]  &&  (a)[6] == (b)[6]  &&  (a)[7] == (b)[7]  &&  (a)[8] == (b)[8]  &&  (a)[9] == (b)[9]  &&  (a)[10] == (b)[10]  &&  (a)[11] == (b)[11]  &&  (a)[12] == '\0')
#define strmatch13(a,b)    ((a)[0] == (b)[0]  &&  (a)[1] == (b)[1]  &&  (a)[2] == (b)[2]  &&  (a)[3] == (b)[3]  &&  (a)[4] == (b)[4]  &&  (a)[5] == (b)[5]  &&  (a)[6] == (b)[6]  &&  (a)[7] == (b)[7]  &&  (a)[8] == (b)[8]  &&  (a)[9] == (b)[9]  &&  (a)[10] == (b)[10]  &&  (a)[11] == (b)[11]  &&  (a)[12] == (b)[12]  &&  (a)[13] == '\0')
72 
73 
74 /* This is an attempt to define a new algorithm for generating a table.
75  * The reason for the need of a new algorithm is performance.
76  * In December 2013 the adult category has 3076456 domains, 276707 urls,
77  * 3353163 leaf nodes and 207536 index nodes:
78  * #sub1:     156411   #sub2:      23879   #sub3:      11226   #sub4:       4923
79  * #sub5:       2275   #sub6:       1269   #sub7:        847
80  * #sub8+:      6443   #sub255+:     260   #sub64K+:       3
81  *
 * On an Intel E5-2420 processor ufdbGenTable takes 32 seconds to generate a .ufdb table file
83  * where 3 seconds are needed for composing an in-memory table of the 3 million domains
84  * and 29 seconds are needed to insert the 275000 urls.
85  * The total time to generate is too much and will only grow when the adult table grows.
86  * The time to insert the relatively small amount of urls is simply too much and
87  * caused by inserting elements in an array with 500000+ elements.  The array shift
88  * is taking 90+ per cent of all CPU time.
89  * The .com TLD has 539320 child nodes and the tumblr.com domain has 691892 child nodes.
90  * Note that the L3 cache of the E5-2420 is 15 MB and that 700000 * 32 bytes (array element size)
91  * is 21.3 MB and hence significantly larger than the L3 cache.
92  *
93  * The input files are 99% sorted so processing them uses a lot of fast tail insertions
94  * except for IP addresses.  Therefore, processing the domains file is fast.
95  * The processing of the urls file is slow since it causes inserts at random points
96  * in the table and therefore causes a lot of array shifts.
97  *
98  * The new algorithm must be much faster: the target is to generate the adult table
99  * in under 5 seconds.
100  *
101  * The new data structure is based upon the old one with the difference that
102  * the large array will be divided in a list of small arrays which are fast to manipulate.
103  * The list of arrays is a kind of a btree.  Each array has a variable size with a maximum of N.
 * N is chosen to be 2000 since 2000*32 = 64000 and is sufficiently small to not
105  * cause much stress for the L2 cache (the L2 cache size is often 256K).
106  * The array size must be small to make sure that an insert in the array is fast so
107  * maybe N must be reduced to 1000 or less.  Experiments with different values were done
108  * and showed a slight performance degradation with smaller and higher values.
109  * Each array is filled and when it becomes full, it is split into 2 arrays where
110  * the two new arrays each have 50% of the elements of the split arrays, EXCEPT
111  * when the array is the last array of the list.  Since there is a lot of tail insertion,
112  * the last array of the list, when it becomes full, is split into one array with N-1
113  * elements and the new last array will have 1 element.
114  *
115  * With N=2000 and a list of arrays with a total of 700000 elements has on average
116  * 700 arrays.
117  *
118  * The data structure used by ufdbGenTable is for ufdbGenTable only and is not used by
119  * the database load functions of ufdbguardd.
120  *
121  * The results are good:
122  * ufdbGentable version   time (s)
123  * 1.31                   32.5
124  * 1.32 new algorithm      3.0
125  */
126 
127 #define MinNodesPerSTA		   1
128 #define MaxNodesPerSTA		2000
129 
130 struct UFDBgentableNode;
131 struct UFDBgentableSTA;
132 
133 
/* One node of the in-memory generation tree.  A node owns an array of nSTA
 * "Subnodes Table Arrays" (STAs), each of which holds a slice of the node's children.
 * NOTE(review): two unsigned ints plus two pointers occupy 24 bytes on a typical
 * LP64 ABI, not the 32 bytes quoted here originally -- confirm on the target platform.
 */
struct UFDBgentableNode						/* 2 x unsigned int + 2 x pointer */
{
   unsigned int               totalSubNodes;			/* sum(stas.nSubNodes); 0 means no deeper level */
   unsigned int               nSTA;				/* number of STAs in stas[] */
   unsigned char *            tag;				/* tag (the URL part this node stands for) */
   struct UFDBgentableSTA *   stas;				/* Subnodes Table Arrays (STAs) */
};
141 
/* A Subnodes Table Array (STA): one bounded array of child nodes.
 * A parent node keeps a list of these instead of a single large array.
 */
struct UFDBgentableSTA						/* 16 bytes */
{
   unsigned int               nSubNodes;			/* #used nodes; n<nodeArraySize */
   unsigned int               nodeArraySize;			/* allocated capacity: 0, 1, 8, 256 or 2000 */
   struct UFDBgentableNode *  subNodes;				/* used as an array of nodes, not as a pointer to a single node */
};
148 
149 static FILE * fin;
150 static char * domainsFileName;
151 static char * urlsFileName;
152 static char * tableName;
153 static char   endian = 'L';
154 
155 static struct UFDBgentableNode * table = NULL;
156 
157 #if 0
158 int    ufdbGV.debug = 0;
159 #endif
160 
161 static int  printStatistics = 0;
162 static int  utf8support = 1;
163 static long numEntries = 0;
164 static long numNodes = 0;
165 static long numLeafNodes = 0;
166 static long numSub1 = 0;
167 static long numSub2 = 0;
168 static long numSub3 = 0;
169 static long numSub4 = 0;
170 static long numSub5 = 0;
171 static long numSub6 = 0;
172 static long numSub7 = 0;
173 static long numSub8 = 0;
174 static long numSub8plus = 0;
175 static long numSub255plus = 0;
176 static long numSub64Kplus = 0;
177 static long numIndexNodes = 0;
178 #if UFDB_DBFORMAT_3
179 static long numLabelNodes = 0;
180 static long numChnksStat = 0;
181 static long numOverflow = 0;
182 static long numFewLabels = 0;
183 static long numVeryFewPaths = 0;
184 static long numFewPaths = 0;
185 static long num250Paths = 0;
186 static long numManyPaths = 0;
187 #endif
188 static int  numWarnings = 0;
189 static int  doCrypt = 1;
190 static int  doBZ2compress = 0;
191 static int  doZLIBcompress = 0;
192 static int  doProd = 0;
193 static int  doPadding = 0;
194 static int  doWarnings = 1;
195 static int  doSanityCheck = 1;
196 static int  urlsIncluded = 1;
197 static int  stripWWW = 0;
198 static int  skipOptimisations = 0;
199 static int  URLparamSupport = 0;
200 static char * format = UFDBdefaultdbVersion;
201 
202 #if UFDB_DO_DEBUG || 0
203 #define DEBUG(x) fprintf x
204 #else
205 #define DEBUG(x)
206 #endif
207 
/* ROUNDUP(i) yields the next multiple of ROUNDUPBY that is strictly greater
 * than i: a value that already is a multiple is still increased by ROUNDUPBY.
 * The argument is evaluated twice.
 */
#define ROUNDUPBY      16
#define ROUNDUP(i)     ( (i) + (ROUNDUPBY - ((i)%ROUNDUPBY) ) )

/* Same as ROUNDUP() but with a step of BIGROUNDUPBY. */
#define BIGROUNDUPBY   128
#define BIGROUNDUP(i)  ( (i) + (BIGROUNDUPBY - ((i)%BIGROUNDUPBY) ) )

/* presumably the size from which BIGROUNDUP() is used instead of ROUNDUP() -- verify at the point of use */
#define ROUNDUPBYCUTOFF BIGROUNDUPBY
215 
216 #include "strcmpurlpart.static.c"
217 
218 
219 
220 #if HAVE_PUTC_UNLOCKED
221 
222 #define myfast_putc(c,fp)   putc_unlocked(c,fp)
223 #define myfast_getc(fp)     getc_unlocked(fp)
224 
225 #if defined(__linux__) && defined(_GNU_SOURCE)
226 #define fast_puts(s,fp)     fputs_unlocked(s,fp)
227 #else
fast_puts(const char * s,FILE * fp)228    UFDB_GCC_INLINE int fast_puts( const char * s, FILE * fp )
229    {
230       int retval;
231 
232       retval = 1;
233       while (*s != '\0'  &&  ((retval = myfast_putc(*s,fp)) != EOF))
234 	 s++;
235 
236       return retval;
237    }
238 #endif
239 
240 #else
241 
242 #define myfast_putc(c,fp)  fputc(c,fp)
243 #define fast_puts(s,fp)    fputs(s,fp)
244 
245 #define myfast_getc(fp)    fgetc(fp)
246 #endif
247 
248 #define MAXLOGMSGSIZE      (60*1024)
249 
250 
usage(void)251 static void usage( void )
252 {
253    fprintf( stderr, "usage: %s [-nNqV] [-C | -Z] [-k <key>] -t <tableName> -d <domains> [-u <urls>]\n",
254             ufdbGV.progname );
255    fprintf( stderr, "flags: -n  no encryption\n" );
256    fprintf( stderr, "       -k  16-char encryption key\n" );
257 #if UFDB_DBFORMAT_3
258    fprintf( stderr, "       -F  2.2|3.0 (default is %s)\n", UFDBdefaultdbVersion );
259 // fprintf( stderr, "  -B | -L  generate big endian or little endian file format (default -L)\n" );
260 #else
261    fprintf( stderr, "       -F  2.0|2.1|2.2 (default is %s)\n", UFDBdefaultdbVersion );
262 #endif
263    fprintf( stderr, "       -t  tablename\n" );
264    fprintf( stderr, "       -d  domains\n" );
265    fprintf( stderr, "       -u  urls\n" );
266    fprintf( stderr, "       -U  domains and URLs are in the file specified by -d\n" );
267 #if UFDB_BZ2LIB_SUPPORT
268    fprintf( stderr, "       -C  use bzip2 compression (relatively slow)\n" );
269 #endif
270    fprintf( stderr, "       -D  debug\n" );
271    fprintf( stderr, "       -N  NO UTF8 support (skip URLs with UTF8 characters)\n" );
272    fprintf( stderr, "       -q  be quiet (suppress warnings)\n" );
273    fprintf( stderr, "       -s  sanity check for domain names (obsolete option - check is always done)\n" );
274    fprintf( stderr, "       -S  print table statistics\n" );
275    fprintf( stderr, "       -V  print version (" UFDB_VERSION ") and exit\n" );
276    fprintf( stderr, "       -W  strip \"www.\" from URLs\n" );
277    fprintf( stderr, "       -X  skip table optimisations - only for expert users\n" );
278    fprintf( stderr, "       -Z  use zlib compression - up to 5 times faster than bzip2 but slightly less compression\n" );
279    exit( 1 );
280 }
281 
282 
randomChar(void)283 UFDB_GCC_INLINE static unsigned char randomChar( void )
284 {
285    static unsigned char * a = (unsigned char *) "1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
286    return a[random() % 62];
287 }
288 
289 
randomisebuf64(unsigned char * buf)290 static void randomisebuf64( unsigned char * buf )
291 {
292    int    n;
293    unsigned int   seed;
294    FILE * fp;
295 
296    seed = 0x05ac7326;
297 
298    if (doProd)
299    {
300       fp = popen( "who", "r" );
301       if (fp != NULL)
302       {
303 	 int ch;
304 	 while ((ch = fgetc(fp)) != EOF)
305 	 {
306 	    seed = (seed >> 11) + 23 + 7 * (seed ^ ch);
307 	 }
308 	 fclose( fp );
309       }
310    }
311 
312 #if 0
313    printf( "random seed is %08x\n", seed );
314 #endif
315 
316    seed = (seed + 1309) ^ (((getpid() << 3) ^ (time(NULL) << 19)) + (getppid() << 26));
317    srandom( seed );
318 
319    for (n = 0; n < 64; n++)
320    {
321       *buf++ = randomChar();
322    }
323 }
324 
325 
ufdbLogError(const char * format,...)326 void ufdbLogError( const char * format, ... )
327 {
328    va_list ap;
329    char    msg[MAXLOGMSGSIZE];
330 
331    va_start( ap, format );
332    if (vsnprintf(msg, MAXLOGMSGSIZE-1, format, ap) > (MAXLOGMSGSIZE - 2))
333       msg[MAXLOGMSGSIZE-1] = '\0';
334    va_end( ap );
335 
336    fprintf( stderr, "ERROR: %s\n", msg );
337 }
338 
339 
ufdbLogMessage(const char * format,...)340 void ufdbLogMessage( const char * format, ... )
341 {
342    va_list ap;
343    char    msg[MAXLOGMSGSIZE];
344 
345    va_start( ap, format );
346    if (vsnprintf(msg, MAXLOGMSGSIZE-8, format, ap) >= (MAXLOGMSGSIZE - 8))
347       msg[MAXLOGMSGSIZE-8] = '\0';
348    va_end( ap );
349 
350    fprintf( stderr, "%s\n", msg );
351 }
352 
353 
ufdbLogFatalError(const char * format,...)354 void ufdbLogFatalError( const char * format, ... )
355 {
356    va_list ap;
357    char    msg[MAXLOGMSGSIZE];
358 
359    va_start( ap, format );
360    if (vsnprintf(msg, MAXLOGMSGSIZE-1, format, ap) > (MAXLOGMSGSIZE - 2))
361       msg[MAXLOGMSGSIZE-1] = '\0';
362    va_end( ap );
363 
364    fprintf( stderr, "FATAL ERROR: %s  *****\n", msg );
365 }
366 
367 
/* Stub with the signature of the log file setup function: this program writes
 * its messages to stderr, so all three arguments are ignored and nothing is done.
 */
void ufdbSetGlobalErrorLogFile(
   char * logdir         __attribute__((unused)),
   char * basename       __attribute__((unused)),
   int    mutex_is_used  __attribute__((unused)) )
{
   /* reference the parameters once to keep every compiler quiet */
   (void) logdir;
   (void) basename;
   (void) mutex_is_used;
}
377 
378 
/* Skip a leading "<scheme>://" in URL.
 * The scheme ends at the first character out of ".:/?@#%"; only when that
 * character is a ':' that is directly followed by "//" is a protocol recognised.
 * Returns a pointer to the first character after "://", or NULL when URL
 * does not start with a protocol.
 */
inline static unsigned char * skipProtocol( unsigned char * URL )
{
   const char * text = (const char *) URL;
   size_t       stop = strcspn( text, ".:/?@#%" );

   if (text[stop] != ':')
      return NULL;
   /* the second slash is only inspected when the first one is present */
   if (text[stop+1] != '/'  ||  text[stop+2] != '/')
      return NULL;

   return URL + stop + 3;
}
388 
389 
/* Placeholder for the comparison of URL parameters: both arguments are
 * ignored and the parameters are always assumed to be different.
 * Always returns 0.
 */
inline static int gentableVerifyParameters(
   struct UFDBgentableNode * node,
   UFDBrevURL *              revUrl )
{
   /* the arguments are intentionally unused */
   (void) node;
   (void) revUrl;

   return 0;
}
401 
402 
403 /* perform lookup of revUrl in the table pointed by its top node.
404  * return 1 iff found, 0 otherwise.
405  */
406 UFDB_GCC_HOT
UFDBgentableLookupRevUrl(struct UFDBgentableNode * node,UFDBrevURL * revUrl)407 static int UFDBgentableLookupRevUrl(
408    struct UFDBgentableNode * node,
409    UFDBrevURL *              revUrl )
410 {
411    int b, e;
412    int i;
413    int cmp;
414 
415    struct UFDBgentableSTA *  sta;
416 #if UFDB_DO_DEBUG
417    struct UFDBgentableNode * origtable = node;
418 #endif
419 
420    /* there is a 2-level search (both binary search):
421     * first, search the STA and second search within the STA.
422     */
423 
424 begin:
425    DEBUG(( stderr, "    UFDBgentableLookupRevUrl:  table %-14s [%d]  tag %14s : %s\n",
426                    origtable->tag, origtable->nSTA, node->tag, revUrl->part ));
427 
428    if (node->nSTA == 0)
429       return 0;
430 
431    i = 0;
432    e = node->nSTA - 1;
433    if (e == 0)
434    {
435       sta = &node->stas[0];
436       DEBUG(( stderr, "      use STA 0 (nnodes is 1)\n" ));
437    }
438    else
439    {
440       sta = NULL;       /* suppress compiler warning */
441       b = 0;
442       while (b <= e)
443       {
444 	 i = (b + e) / 2;
445 	 sta = &node->stas[i];
446 	 cmp = strcmpURLpart( (char *) revUrl->part, (char *) sta->subNodes[0].tag );
447 	 DEBUG(( stderr, "      last node tag in STA %-3d (nnodes is %d)  %s \n",
448                          i, sta->nSubNodes, sta->subNodes[sta->nSubNodes-1].tag ));
449 	 DEBUG(( stderr, "      first node tag in STA %-3d  strcmpURLpart( %s, %s )  is %d\n",
450                          i, (char *) revUrl->part, (char *) sta->subNodes[0].tag, cmp ));
451 	 if (cmp < 0)
452 	    e = i - 1;
453 	 else if (cmp > 0)
454 	    b = i + 1;
455 	 else
456 	    break;
457       }
458       /* the above comparisons were done against sta[i].subnodes[0].tag and
459        * we may be one STA too far so compare the tag with sta[-1].subnodes[last].tag
460        */
461       if (i > 0)
462       {
463          struct UFDBgentableSTA *  prevsta;
464          prevsta = &node->stas[i-1];
465          cmp = strcmpURLpart( (char *) revUrl->part, (char *) prevsta->subNodes[prevsta->nSubNodes-1].tag );
466          if (cmp <= 0)
467          {
468             i = i - 1;
469             sta = prevsta;
470             DEBUG(( stderr, "      tag is in previous STA (%d)", i ));
471          }
472       }
473    }
474    DEBUG(( stderr, "      UFDBgentableLookupRevUrl  STA %d:  nSubNodes %d\n", i, sta->nSubNodes ));
475 
476    // the sta has been found.  Now search inside the sta
477    b = 0;
478    e = sta->nSubNodes - 1;
479    while (b <= e)
480    {
481       i = (b + e) / 2;
482       cmp = strcmpURLpart( (char *) revUrl->part, (char *) sta->subNodes[i].tag );
483       DEBUG(( stderr, "         i:%-3d strcmpURLpart( %s, %s )  is %d\n",
484               i, (char *) revUrl->part, (char *) sta->subNodes[i].tag, cmp ));
485       if (cmp < 0)
486 	 e = i - 1;
487       else if (cmp > 0)
488 	 b = i + 1;
489       else
490       {
491          int is_path = (revUrl->part[0] == '/');
492 
493 	 node = &(sta->subNodes[i]);
494 	 if (node->totalSubNodes == 0)		/* no more levels in table -> MATCH */
495          {
496             DEBUG(( stderr, "            no more levels in table -> MATCH\n" ));
497 	    return 1;
498          }
499 
500          if (is_path  &&  node->totalSubNodes > 0)
501          {
502             DEBUG(( stderr, "            is_path: gentableVerifyParameters()\n" ));
503             return gentableVerifyParameters( node, revUrl->next );
504          }
505 
506 	 revUrl = revUrl->next;
507 	 if (revUrl == NULL)			/* no more levels in URL -> NO match */
508          {
509             DEBUG(( stderr, "            no more levels in URL -> NO match\n" ));
510 	    return 0;
511          }
512 
513 	 goto begin;
514       }
515    }
516 
517    DEBUG(( stderr, "            not found\n" ));
518    return 0;  /* not found */
519 }
520 
521 
/* Growable in-memory buffer used by init_membuf() and the mem_*() functions. */
static unsigned char * mem = NULL;		/* start of the buffer (malloc/realloc) */
static unsigned long mem_i = 0;			/* index of the next byte to write */
static unsigned long mem_size = 0;		/* allocated size of the buffer in bytes */
525 
init_membuf(void)526 static void init_membuf( void )
527 {
528    mem_size = 2 * 1024 * 1024;
529    mem = (unsigned char *) malloc( mem_size );
530 
531    /* starting with format 2.1 there are 64 random bytes inserted at the beginning
532     * of the crypted table as a defense against brute force decryption attempts.
533     */
534    if (doCrypt  &&  strcmp( format, "2.1") >= 0)
535    {
536       randomisebuf64( mem );
537       mem_i += 64;
538 #if UFDB_DO_DEBUG
539       if (ufdbGV.debug)
540 	 fprintf( stderr, "inserted 64 random bytes at the start of the URL table\n" );
541 #endif
542    }
543 }
544 
545 
546 UFDB_GCC_HOT
mem_putc(unsigned char c)547 static void mem_putc( unsigned char c )
548 {
549    if (mem_i == mem_size)
550    {
551       mem_size += 2 * 1024 * 1024;
552       if (mem_size > 4 * UINT_MAX)
553       {
554          ufdbLogFatalError( "mem_putc: table is too large!" );
555          exit( 1 );
556       }
557       mem = (unsigned char *) realloc( mem, mem_size );
558       if (mem == NULL)
559       {
560          fprintf( stderr, "cannot allocate %lu bytes for the in-memory table.\n", mem_size );
561 	 exit( 1 );
562       }
563    }
564    mem[mem_i++] = c;
565 }
566 
567 
568 UFDB_GCC_HOT
mem_pad(void)569 static void mem_pad( void )
570 {
571    if (mem_i + 8 >= mem_size)
572    {
573       mem_size += 2 * 1024 * 1024;
574       mem = (unsigned char *) realloc( mem, mem_size );
575       if (mem == NULL)
576       {
577          fprintf( stderr, "cannot allocate %lu bytes for the in-memory table.\n", mem_size );
578 	 exit( 1 );
579       }
580    }
581 
582    while (mem_i % 8 > 0)
583       mem[mem_i++] = UFDBpadTable;
584 }
585 
586 
587 UFDB_GCC_HOT UFDB_GCC_INLINE
mem_puts(unsigned char * s)588 static void mem_puts( unsigned char * s )
589 {
590    if (mem_i + sizeof(UFDBurlPart) >= mem_size)
591    {
592       mem_size += 2 * 1024 * 1024;
593       if (mem_size > 4 * UINT_MAX)
594       {
595          ufdbLogFatalError( "mem_puts: table is too large!" );
596          exit( 1 );
597       }
598       mem = (unsigned char *) realloc( mem, mem_size );
599       if (mem == NULL)
600       {
601          fprintf( stderr, "cannot allocate %lu bytes for the in-memory table.\n", mem_size );
602 	 exit( 1 );
603       }
604    }
605 
606    while (*s != '\0')
607       mem[mem_i++] = *s++;
608 }
609 
610 
611 UFDB_GCC_HOT UFDB_GCC_INLINE
mem_putsnt(unsigned char * s)612 static void mem_putsnt( unsigned char * s )
613 {
614    if (mem_i + sizeof(UFDBurlPart) >= mem_size)
615    {
616       mem_size += 2 * 1024 * 1024;
617       if (mem_size > 4 * UINT_MAX)
618       {
619          ufdbLogFatalError( "mem_putsnt: table is too large!" );
620          exit( 1 );
621       }
622       mem = (unsigned char *) realloc( mem, mem_size );
623       if (mem == NULL)
624       {
625          fprintf( stderr, "cannot allocate %lu bytes for the in-memory table.\n", mem_size );
626 	 exit( 1 );
627       }
628    }
629 
630    while (*s != '\0')
631       mem[mem_i++] = *s++;
632    mem[mem_i++] = '\0';
633 }
634 
635 
636 #if UFDB_DBFORMAT_3
637 UFDB_GCC_HOT
mem_alloc4(unsigned long nbytes)638 static uint64_t mem_alloc4( unsigned long nbytes )      // returns a byte offset
639 {
640    unsigned long new_mem_i = 0;
641    new_mem_i = (mem_i + 3) & ~3;    // round up to multiple of 4
642    if (new_mem_i != mem_i)
643    {
644       while (mem_i != new_mem_i)
645          mem_putc( '\0' );
646    }
647    uint64_t m = mem_i;
648    if (ufdbGV.debug)
649       ufdbLogMessage( "      mem_alloc4 %5lu bytes   byte-offset %-6lu  table-offset %lu", nbytes, m, m>>2 );
650 
651    if (mem_i + nbytes >= mem_size)
652    {
653       if (nbytes >= 2 * 1024 * 1024)
654          mem_size += (nbytes + 2 * 1024 * 1024);
655       else
656          mem_size += 2 * 1024 * 1024;
657       if (mem_size > 4 * UINT_MAX)
658       {
659          ufdbLogFatalError( "mem_alloc4: table is too large!" );
660          exit( 1 );
661       }
662       mem = realloc( mem, mem_size );
663       if (mem == NULL)
664       {
665          fprintf( stderr, "mem_alloc4: cannot allocate %lu bytes for the in-memory table.\n", mem_size );
666          exit( 1 );
667       }
668       if (ufdbGV.debug)
669          ufdbLogMessage( "mem_alloc4: realloc to size %'ld", mem_size );
670    }
671 
672    mem_i += nbytes;
673    return m;
674 }
675 #endif
676 
677 
678 #if 0
679 UFDB_GCC_HOT
680 static uint64_t mem_alloc16( unsigned long nbytes )      // returns a byte offset
681 {
682    unsigned long new_mem_i = 0;
683    new_mem_i = (mem_i + 15) & ~15;    // round up to multiple of 16
684    if (new_mem_i != mem_i)
685    {
686       while (mem_i != new_mem_i)
687          mem_putc( '\0' );
688    }
689    uint64_t m = mem_i;
690    if (ufdbGV.debug)
691       ufdbLogMessage( "      mem_alloc16 %4lu bytes   byte-offset %-6lu  table-offset %lu", nbytes, m, m>>2 );
692 
693    if (mem_i + nbytes >= mem_size)
694    {
695       // caller may request 600 KB and adding 256 KB won't do!!
696       if (nbytes >= 256 * 1024)
697          mem_size += (nbytes + 256 * 1024);
698       else if (mem_size < 1024 * 1024)
699          mem_size += 256 * 1024;
700       else
701          mem_size += 2 * 1024 * 1024;
702       if (mem_size > 4 * UINT_MAX)
703       {
704          ufdbLogFatalError( "mem_alloc16: table is too large!" );
705          exit( 1 );
706       }
707       mem = realloc( mem, mem_size );
708       if (mem == NULL)
709       {
710          fprintf( stderr, "mem_alloc16: cannot allocate %lu bytes for the in-memory table.\n", mem_size );
711          exit( 1 );
712       }
713    }
714 
715    mem_i += nbytes;
716    return m;
717 }
718 #endif
719 
720 
721 #if 0
722 #define _STRDUP(s) ufdbStrdup(s)
723 #else
724 /*
725  *  Small speed optimisation: allocate memory for strdupped strings in large blocks since they are never freed.
726  */
727 UFDB_GCC_HOT UFDB_GCC_INLINE
_STRDUP(char * s)728 static char * _STRDUP(
729    char * s )
730 {
731    static char * freeMem = NULL;
732    static char * last = NULL;
733 
734    char * f;
735    char * p;
736 
737    if ((int) (last - freeMem) < (int) sizeof(UFDBurlPart))
738    {
739       freeMem = (char *) malloc( 256 * 1024 );
740       last = freeMem + 256 * 1024 - 1;
741    }
742    f = freeMem;
743 
744    p = (char *) memccpy( freeMem, s, '\0', sizeof(UFDBurlPart) );
745    if (p == NULL)
746    {
747       freeMem += sizeof(UFDBurlPart);
748       *freeMem++ = '\0';
749    }
750    else
751       freeMem = p;
752 
753    return f;
754 }
755 #endif
756 
757 
758 UFDB_GCC_HOT
UFDBsanityCheckDomainname(char * url)759 static int UFDBsanityCheckDomainname( char * url )
760 {
761    unsigned char * s;
762    char * first_slash;
763    char * tld;
764    int    retval;
765    char * oldBracket;
766 
767 #if 0
768    fprintf( stderr, "UFDBsanityCheckDomainname: %s\n", url );
769 #endif
770 
771    if (*url == '[')			/* IPv6 address */
772    {
773       char normalisedDomain[64];
774 
775       oldBracket = strchr( url, ']' );
776       if (oldBracket == NULL)
777       {
778          fprintf( stderr, "error: IPv6 address has no closing ']': %s\n", url );
779 	 return 0;
780       }
781       *oldBracket = '\0';
782       if (UFDBparseIPv6address( url+1, normalisedDomain ) == NULL)
783       {
784 	 *oldBracket = ']';
785 	 return 0;
786       }
787       else
788       {
789 	 *oldBracket = ']';
790 	 UFDBupdateURLwithNormalisedDomain( url, normalisedDomain );
791          return 1;
792       }
793    }
794 
795    first_slash = strchr( url, '/' );
796    if (first_slash != NULL)
797       *first_slash = '\0';
798 
799    tld = strrchr( url, '.' );
800    if (tld == NULL)
801       tld = url;
802    else
803       tld++;
804 
805    retval = 1;
806    if (*tld >= '0'  &&  *tld <= '9')
807       ;
808    else
809    if (
810        strmatch2( tld, "ac" ) != 0  &&
811        strmatch2( tld, "ad" ) != 0  &&
812        strmatch3( tld, "ads" ) != 0  &&
813        strmatch5( tld, "adult" ) != 0  &&
814        strmatch2( tld, "ae" ) != 0  &&
815        strmatch4( tld, "aero" ) != 0  &&
816        strmatch2( tld, "af" ) != 0  &&
817        strmatch2( tld, "ag" ) != 0  &&
818        strmatch6( tld, "agency" ) != 0  &&
819        strmatch2( tld, "ai" ) != 0  &&
820        strmatch2( tld, "al" ) != 0  &&
821        strmatch2( tld, "am" ) != 0  &&
822        strmatch9( tld, "amsterdam" ) != 0  &&
823        strmatch2( tld, "an" ) != 0  &&
824        strmatch2( tld, "ao" ) != 0  &&
825        strmatch3( tld, "app" ) != 0  &&
826        strmatch5( tld, "apple" ) != 0  &&
827        strmatch2( tld, "aq" ) != 0  &&
828        strmatch2( tld, "ar" ) != 0  &&
829        strmatch4( tld, "arpa" ) != 0  &&
830        strmatch2( tld, "as" ) != 0  &&
831        strmatch4( tld, "asia" ) != 0  &&
832        strmatch2( tld, "at" ) != 0  &&
833        strmatch2( tld, "au" ) != 0  &&
834        strmatch5( tld, "audio" ) != 0  &&
835        strmatch2( tld, "aw" ) != 0  &&
836        strmatch2( tld, "ax" ) != 0  &&
837        strmatch2( tld, "az" ) != 0  &&
838        strmatch2( tld, "ba" ) != 0  &&
839        strmatch6( tld, "bayern" ) != 0  &&
840        strmatch2( tld, "bb" ) != 0  &&
841        strmatch2( tld, "bd" ) != 0  &&
842        strmatch2( tld, "be" ) != 0  &&
843        strmatch6( tld, "berlin" ) != 0  &&
844        strmatch2( tld, "bf" ) != 0  &&
845        strmatch2( tld, "bg" ) != 0  &&
846        strmatch2( tld, "bh" ) != 0  &&
847        strmatch6( tld, "bharat" ) != 0  &&
848        strmatch2( tld, "bi" ) != 0  &&
849        strmatch4( tld, "bike" ) != 0  &&
850        strmatch3( tld, "biz" ) != 0  &&
851        strmatch2( tld, "bj" ) != 0  &&
852        strmatch4( tld, "blue" ) != 0  &&
853        strmatch2( tld, "bm" ) != 0  &&
854        strmatch2( tld, "bn" ) != 0  &&
855        strmatch10( tld, "bnpparibas" ) != 0  &&
856        strmatch2( tld, "bo" ) != 0  &&
857        strmatch2( tld, "br" ) != 0  &&
858        strmatch8( tld, "bradesco" ) != 0  &&
859        strmatch8( tld, "brussels" ) != 0  &&
860        strmatch2( tld, "bs" ) != 0  &&
861        strmatch2( tld, "bt" ) != 0  &&
862        strmatch2( tld, "bv" ) != 0  &&
863        strmatch2( tld, "bw" ) != 0  &&
864        strmatch2( tld, "by" ) != 0  &&
865        strmatch2( tld, "bz" ) != 0  &&
866        strmatch3( tld, "bzh" ) != 0  &&
867        strmatch2( tld, "ca" ) != 0  &&
868        strmatch4( tld, "cafe" ) != 0  &&
869        strmatch6( tld, "camera" ) != 0  &&
870        strmatch6( tld, "casino" ) != 0  &&
871        strmatch3( tld, "cat" ) != 0  &&
872        strmatch2( tld, "cc" ) != 0  &&
873        strmatch2( tld, "cd" ) != 0  &&
874        strmatch6( tld, "center" ) != 0  &&
875        strmatch2( tld, "cf" ) != 0  &&
876        strmatch2( tld, "cg" ) != 0  &&
877        strmatch2( tld, "ch" ) != 0  &&
878        strmatch5( tld, "cheap" ) != 0  &&
879        strmatch2( tld, "ci" ) != 0  &&
880        strmatch2( tld, "ck" ) != 0  &&
881        strmatch2( tld, "cl" ) != 0  &&
882        strmatch4( tld, "club" ) != 0  &&
883        strmatch2( tld, "cm" ) != 0  &&
884        strmatch2( tld, "cn" ) != 0  &&
885        strmatch2( tld, "co" ) != 0  &&
886        strmatch5( tld, "coach" ) != 0  &&
887        strmatch5( tld, "codes" ) != 0  &&
888        strmatch3( tld, "com" ) != 0  &&
889        strmatch7( tld, "company" ) != 0  &&
890        strmatch4( tld, "coop" ) != 0  &&
891        strmatch7( tld, "country" ) != 0  &&
892        strmatch2( tld, "cr" ) != 0  &&
893        strmatch7( tld, "cricket" ) != 0  &&
894        strmatch2( tld, "cu" ) != 0  &&
895        strmatch2( tld, "cu" ) != 0  &&
896        strmatch2( tld, "cv" ) != 0  &&
897        strmatch2( tld, "cx" ) != 0  &&
898        strmatch2( tld, "cy" ) != 0  &&
899        strmatch2( tld, "cz" ) != 0  &&
900        strmatch6( tld, "dating" ) != 0  &&
901        strmatch2( tld, "de" ) != 0  &&
902        strmatch6( tld, "degree" ) != 0  &&
903        strmatch4( tld, "desi" ) != 0  &&
904        strmatch9( tld, "directory" ) != 0  &&
905        strmatch2( tld, "dj" ) != 0  &&
906        strmatch2( tld, "dk" ) != 0  &&
907        strmatch2( tld, "dm" ) != 0  &&
908        strmatch2( tld, "do" ) != 0  &&
909        strmatch2( tld, "do" ) != 0  &&
910        strmatch2( tld, "dz" ) != 0  &&
911        strmatch2( tld, "ec" ) != 0  &&
912        strmatch3( tld, "edu" ) != 0  &&
913        strmatch2( tld, "ee" ) != 0  &&
914        strmatch2( tld, "eg" ) != 0  &&
915        strmatch5( tld, "email" ) != 0  &&
916        strmatch2( tld, "er" ) != 0  &&
917        strmatch2( tld, "es" ) != 0  &&
918        strmatch2( tld, "et" ) != 0  &&
919        strmatch2( tld, "eu" ) != 0  &&
920        strmatch3( tld, "eus" ) != 0  &&
921        strmatch6( tld, "expert" ) != 0  &&
922        strmatch2( tld, "fi" ) != 0  &&
923        strmatch7( tld, "fitness" ) != 0  &&
924        strmatch2( tld, "fj" ) != 0  &&
925        strmatch2( tld, "fk" ) != 0  &&
926        strmatch2( tld, "fm" ) != 0  &&
927        strmatch2( tld, "fm" ) != 0  &&
928        strmatch2( tld, "fo" ) != 0  &&
929        strmatch8( tld, "football" ) != 0  &&
930        strmatch10( tld, "foundation" ) != 0  &&
931        strmatch2( tld, "fr" ) != 0  &&
932        strmatch2( tld, "fr" ) != 0  &&
933        strmatch2( tld, "ga" ) != 0  &&
934        strmatch7( tld, "gallery" ) != 0  &&
935        strmatch2( tld, "gb" ) != 0  &&
936        strmatch2( tld, "gd" ) != 0  &&
937        strmatch2( tld, "ge" ) != 0  &&
938        strmatch4( tld, "gent" ) != 0  &&
939        strmatch2( tld, "gf" ) != 0  &&
940        strmatch2( tld, "gg" ) != 0  &&
941        strmatch2( tld, "gg" ) != 0  &&
942        strmatch2( tld, "gh" ) != 0  &&
943        strmatch2( tld, "gi" ) != 0  &&
944        strmatch2( tld, "gi" ) != 0  &&
945        strmatch2( tld, "gl" ) != 0  &&
946        strmatch5( tld, "glass" ) != 0  &&
947        strmatch2( tld, "gm" ) != 0  &&
948        strmatch5( tld, "gmail" ) != 0  &&
949        strmatch2( tld, "gn" ) != 0  &&
950        strmatch4( tld, "golf" ) != 0  &&
951        strmatch3( tld, "gov" ) != 0  &&
952        strmatch2( tld, "gp" ) != 0  &&
953        strmatch2( tld, "gq" ) != 0  &&
954        strmatch2( tld, "gr" ) != 0  &&
955        strmatch8( tld, "graphics" ) != 0  &&
956        strmatch6( tld, "gratis" ) != 0  &&
957        strmatch2( tld, "gs" ) != 0  &&
958        strmatch2( tld, "gt" ) != 0  &&
959        strmatch2( tld, "gt" ) != 0  &&
960        strmatch2( tld, "gu" ) != 0  &&
961        strmatch5( tld, "guide" ) != 0  &&
962        strmatch4( tld, "guru" ) != 0  &&
963        strmatch2( tld, "gw" ) != 0  &&
964        strmatch2( tld, "gy" ) != 0  &&
965        strmatch4( tld, "help" ) != 0  &&
966        strmatch2( tld, "hk" ) != 0  &&
967        strmatch2( tld, "hm" ) != 0  &&
968        strmatch2( tld, "hn" ) != 0  &&
969        strmatch8( tld, "holdings" ) != 0  &&
970        strmatch4( tld, "host" ) != 0  &&
971        strmatch3( tld, "how" ) != 0  &&
972        strmatch2( tld, "hr" ) != 0  &&
973        strmatch2( tld, "ht" ) != 0  &&
974        strmatch2( tld, "hu" ) != 0  &&
975        strmatch2( tld, "id" ) != 0  &&
976        strmatch2( tld, "ie" ) != 0  &&
977        strmatch2( tld, "il" ) != 0  &&
978        strmatch2( tld, "im" ) != 0  &&
979        strmatch2( tld, "in" ) != 0  &&
980        strmatch4( tld, "info" ) != 0  &&
981        strmatch3( tld, "ink" ) != 0  &&
982        strmatch3( tld, "int" ) != 0  &&
983        strmatch13( tld, "international" ) != 0  &&
984        strmatch11( tld, "investments" ) != 0  &&
985        strmatch2( tld, "io" ) != 0  &&
986        strmatch2( tld, "iq" ) != 0  &&
987        strmatch2( tld, "ir" ) != 0  &&
988        strmatch2( tld, "is" ) != 0  &&
989        strmatch2( tld, "it" ) != 0  &&
990        strmatch2( tld, "je" ) != 0  &&
991        strmatch5( tld, "jetzt" ) != 0  &&
992        strmatch2( tld, "jm" ) != 0  &&
993        strmatch2( tld, "jo" ) != 0  &&
994        strmatch4( tld, "jobs" ) != 0  &&
995        strmatch2( tld, "jp" ) != 0  &&
996        strmatch6( tld, "kaufen" ) != 0  &&
997        strmatch2( tld, "ke" ) != 0  &&
998        strmatch2( tld, "kg" ) != 0  &&
999        strmatch2( tld, "kh" ) != 0  &&
1000        strmatch2( tld, "ki" ) != 0  &&
1001        strmatch2( tld, "km" ) != 0  &&
1002        strmatch2( tld, "kn" ) != 0  &&
1003        strmatch5( tld, "koeln" ) != 0  &&
1004        strmatch2( tld, "kr" ) != 0  &&
1005        strmatch2( tld, "kw" ) != 0  &&
1006        strmatch2( tld, "ky" ) != 0  &&
1007        strmatch2( tld, "kz" ) != 0  &&
1008        strmatch2( tld, "la" ) != 0  &&
1009        strmatch4( tld, "land" ) != 0  &&
1010        strmatch2( tld, "lb" ) != 0  &&
1011        strmatch2( tld, "lc" ) != 0  &&
1012        strmatch2( tld, "li" ) != 0  &&
1013        strmatch4( tld, "link" ) != 0  &&
1014        strmatch2( tld, "lk" ) != 0  &&
1015        strmatch3( tld, "lol" ) != 0  &&
1016        strmatch6( tld, "london" ) != 0  &&
1017        strmatch4( tld, "love" ) != 0  &&
1018        strmatch2( tld, "lr" ) != 0  &&
1019        strmatch2( tld, "ls" ) != 0  &&
1020        strmatch2( tld, "lt" ) != 0  &&
1021        strmatch2( tld, "lu" ) != 0  &&
1022        strmatch2( tld, "lv" ) != 0  &&
1023        strmatch2( tld, "ly" ) != 0  &&
1024        strmatch2( tld, "ma" ) != 0  &&
1025        strmatch2( tld, "mc" ) != 0  &&
1026        strmatch2( tld, "md" ) != 0  &&
1027        strmatch2( tld, "me" ) != 0  &&
1028        strmatch5( tld, "media" ) != 0  &&
1029        strmatch8( tld, "memorial" ) != 0  &&
1030        strmatch4( tld, "menu" ) != 0  &&
1031        strmatch2( tld, "mg" ) != 0  &&
1032        strmatch2( tld, "mh" ) != 0  &&
1033        strmatch3( tld, "mil" ) != 0  &&
1034        strmatch2( tld, "mk" ) != 0  &&
1035        strmatch2( tld, "ml" ) != 0  &&
1036        strmatch2( tld, "mm" ) != 0  &&
1037        strmatch2( tld, "mn" ) != 0  &&
1038        strmatch2( tld, "mo" ) != 0  &&
1039        strmatch4( tld, "mobi" ) != 0  &&
1040        strmatch6( tld, "mobile" ) != 0  &&
1041        strmatch4( tld, "moda" ) != 0  &&
1042        strmatch3( tld, "moe" ) != 0  &&
1043        strmatch2( tld, "mp" ) != 0  &&
1044        strmatch2( tld, "mr" ) != 0  &&
1045        strmatch2( tld, "ms" ) != 0  &&
1046        strmatch2( tld, "mt" ) != 0  &&
1047        strmatch2( tld, "mt" ) != 0  &&
1048        strmatch2( tld, "mu" ) != 0  &&
1049        strmatch6( tld, "museum" ) != 0  &&
1050        strmatch5( tld, "music" ) != 0  &&
1051        strmatch2( tld, "mv" ) != 0  &&
1052        strmatch2( tld, "mw" ) != 0  &&
1053        strmatch2( tld, "mx" ) != 0  &&
1054        strmatch2( tld, "my" ) != 0  &&
1055        strmatch2( tld, "mz" ) != 0  &&
1056        strmatch2( tld, "mz" ) != 0  &&
1057        strmatch2( tld, "na" ) != 0  &&
1058        strmatch4( tld, "name" ) != 0  &&
1059        strmatch2( tld, "nc" ) != 0  &&
1060        strmatch2( tld, "ne" ) != 0  &&
1061        strmatch3( tld, "net" ) != 0  &&
1062        strmatch7( tld, "network" ) != 0  &&
1063        strmatch2( tld, "nf" ) != 0  &&
1064        strmatch2( tld, "ng" ) != 0  &&
1065        strmatch3( tld, "ngo" ) != 0  &&
1066        strmatch2( tld, "ni" ) != 0  &&
1067        strmatch2( tld, "ni" ) != 0  &&
1068        strmatch5( tld, "ninja" ) != 0  &&
1069        strmatch2( tld, "nl" ) != 0  &&
1070        strmatch2( tld, "no" ) != 0  &&
1071        strmatch2( tld, "np" ) != 0  &&
1072        strmatch2( tld, "nr" ) != 0  &&
1073        strmatch3( tld, "nrw" ) != 0  &&
1074        strmatch2( tld, "nu" ) != 0  &&
1075        strmatch2( tld, "nu" ) != 0  &&
1076        strmatch2( tld, "nz" ) != 0  &&
1077        strmatch2( tld, "om" ) != 0  &&
1078        strmatch3( tld, "ong" ) != 0  &&
1079        strmatch3( tld, "ooo" ) != 0  &&
1080        strmatch3( tld, "org" ) != 0  &&
1081        strmatch3( tld, "ovh" ) != 0  &&
1082        strmatch2( tld, "pa" ) != 0  &&
1083        strmatch5( tld, "paris" ) != 0  &&
1084        strmatch5( tld, "party" ) != 0  &&
1085        strmatch2( tld, "pe" ) != 0  &&
1086        strmatch2( tld, "pf" ) != 0  &&
1087        strmatch2( tld, "pg" ) != 0  &&
1088        strmatch2( tld, "ph" ) != 0  &&
1089        strmatch8( tld, "pharmacy" ) != 0  &&
1090        strmatch5( tld, "photo" ) != 0  &&
1091        strmatch6( tld, "photos" ) != 0  &&
1092        strmatch4( tld, "pics" ) != 0  &&
1093        strmatch4( tld, "pink" ) != 0  &&
1094        strmatch2( tld, "pk" ) != 0  &&
1095        strmatch2( tld, "pl" ) != 0  &&
1096        strmatch2( tld, "pm" ) != 0  &&
1097        strmatch2( tld, "pn" ) != 0  &&
1098        strmatch5( tld, "poker" ) != 0  &&
1099        strmatch4( tld, "post" ) != 0  &&
1100        strmatch2( tld, "pr" ) != 0  &&
1101        strmatch5( tld, "press" ) != 0  &&
1102        strmatch3( tld, "pro" ) != 0  &&
1103        strmatch2( tld, "ps" ) != 0  &&
1104        strmatch2( tld, "pt" ) != 0  &&
1105        strmatch3( tld, "pub" ) != 0  &&
1106        strmatch2( tld, "pw" ) != 0  &&
1107        strmatch2( tld, "py" ) != 0  &&
1108        strmatch2( tld, "py" ) != 0  &&
1109        strmatch2( tld, "qa" ) != 0  &&
1110        strmatch2( tld, "re" ) != 0  &&
1111        strmatch3( tld, "red" ) != 0  &&
1112        strmatch7( tld, "reviews" ) != 0  &&
1113        strmatch10( tld, "restaurant" ) != 0  &&
1114        strmatch2( tld, "ro" ) != 0  &&
1115        strmatch5( tld, "rocks" ) != 0  &&
1116        strmatch2( tld, "rs" ) != 0  &&
1117        strmatch2( tld, "ru" ) != 0  &&
1118        strmatch4( tld, "ruhr" ) != 0  &&
1119        strmatch2( tld, "rw" ) != 0  &&
1120        strmatch2( tld, "sa" ) != 0  &&
1121        strmatch2( tld, "sb" ) != 0  &&
1122        strmatch2( tld, "sc" ) != 0  &&
1123        strmatch7( tld, "science" ) != 0  &&
1124        strmatch6( tld, "school" ) != 0  &&
1125        strmatch4( tld, "scot" ) != 0  &&
1126        strmatch2( tld, "sd" ) != 0  &&
1127        strmatch2( tld, "se" ) != 0  &&
1128        strmatch4( tld, "sexy" ) != 0  &&
1129        strmatch2( tld, "sg" ) != 0  &&
1130        strmatch2( tld, "sh" ) != 0  &&
1131        strmatch2( tld, "si" ) != 0  &&
1132        strmatch7( tld, "singles" ) != 0  &&
1133        strmatch2( tld, "sj" ) != 0  &&
1134        strmatch2( tld, "sk" ) != 0  &&
1135        strmatch3( tld, "ski" ) != 0  &&
1136        strmatch2( tld, "sl" ) != 0  &&
1137        strmatch2( tld, "sm" ) != 0  &&
1138        strmatch2( tld, "sn" ) != 0  &&
1139        strmatch2( tld, "so" ) != 0  &&
1140        strmatch6( tld, "social" ) != 0  &&
1141        strmatch5( tld, "sport" ) != 0  &&
1142        strmatch2( tld, "sr" ) != 0  &&
1143        strmatch2( tld, "st" ) != 0  &&
1144        strmatch2( tld, "su" ) != 0  &&
1145        strmatch2( tld, "sv" ) != 0  &&
1146        strmatch2( tld, "sx" ) != 0  &&
1147        strmatch2( tld, "sy" ) != 0  &&
1148        strmatch7( tld, "systems" ) != 0  &&
1149        strmatch2( tld, "sz" ) != 0  &&
1150        strmatch5( tld, "tatar" ) != 0  &&
1151        strmatch2( tld, "tc" ) != 0  &&
1152        strmatch2( tld, "td" ) != 0  &&
1153        strmatch4( tld, "tech" ) != 0  &&
1154        strmatch10( tld, "technology" ) != 0  &&
1155        strmatch3( tld, "tel" ) != 0  &&
1156        strmatch2( tld, "tf" ) != 0  &&
1157        strmatch2( tld, "tg" ) != 0  &&
1158        strmatch2( tld, "th" ) != 0  &&
1159        strmatch4( tld, "tips" ) != 0  &&
1160        strmatch5( tld, "tirol" ) != 0  &&
1161        strmatch2( tld, "tj" ) != 0  &&
1162        strmatch2( tld, "tk" ) != 0  &&
1163        strmatch2( tld, "tl" ) != 0  &&
1164        strmatch2( tld, "tm" ) != 0  &&
1165        strmatch2( tld, "tn" ) != 0  &&
1166        strmatch2( tld, "to" ) != 0  &&
1167        strmatch5( tld, "today" ) != 0  &&
1168        strmatch3( tld, "top" ) != 0  &&
1169        strmatch2( tld, "tp" ) != 0  &&
1170        strmatch2( tld, "tr" ) != 0  &&
1171        strmatch8( tld, "training" ) != 0  &&
1172        strmatch6( tld, "travel" ) != 0  &&
1173        strmatch2( tld, "tt" ) != 0  &&
1174        strmatch2( tld, "tv" ) != 0  &&
1175        strmatch2( tld, "tw" ) != 0  &&
1176        strmatch2( tld, "tz" ) != 0  &&
1177        strmatch2( tld, "ua" ) != 0  &&
1178        strmatch2( tld, "ug" ) != 0  &&
1179        strmatch2( tld, "uk" ) != 0  &&
1180        strmatch2( tld, "um" ) != 0  &&
1181        strmatch3( tld, "uno" ) != 0  &&
1182        strmatch2( tld, "us" ) != 0  &&
1183        strmatch2( tld, "uy" ) != 0  &&
1184        strmatch2( tld, "uz" ) != 0  &&
1185        strmatch2( tld, "va" ) != 0  &&
1186        strmatch2( tld, "vc" ) != 0  &&
1187        strmatch2( tld, "ve" ) != 0  &&
1188        strmatch5( tld, "vegas" ) != 0  &&
1189        strmatch12( tld, "versicherung" ) != 0  &&
1190        strmatch2( tld, "vg" ) != 0  &&
1191        strmatch2( tld, "vi" ) != 0  &&
1192        strmatch10( tld, "vlaanderen" ) != 0  &&
1193        strmatch2( tld, "vn" ) != 0  &&
1194        strmatch6( tld, "voyage" ) != 0  &&
1195        strmatch2( tld, "vu" ) != 0  &&
1196        strmatch4( tld, "wang" ) != 0  &&
1197        strmatch5( tld, "wales" ) != 0  &&
1198        strmatch6( tld, "webcam" ) != 0  &&
1199        strmatch7( tld, "website" ) != 0  &&
1200        strmatch2( tld, "wf" ) != 0  &&
1201        strmatch7( tld, "whoswho" ) != 0  &&
1202        strmatch4( tld, "wifi" ) != 0  &&
1203        strmatch4( tld, "wiki" ) != 0  &&
1204        strmatch4( tld, "work" ) != 0  &&
1205        strmatch5( tld, "world" ) != 0  &&
1206        strmatch4( tld, "wpad" ) != 0  &&
1207        strmatch2( tld, "ws" ) != 0  &&
1208        strmatch3( tld, "wtf" ) != 0  &&
1209        strmatch4( tld, "xn--" ) != 0  &&		/* e.g. xn--p1ai */
1210        strmatch3( tld, "xxx" ) != 0  &&
1211        strmatch3( tld, "xyz" ) != 0  &&
1212        strmatch2( tld, "ye" ) != 0  &&
1213        strmatch2( tld, "yt" ) != 0  &&
1214        strmatch2( tld, "yu" ) != 0  &&
1215        strmatch2( tld, "za" ) != 0  &&
1216        strmatch2( tld, "zm" ) != 0  &&
1217        strmatch4( tld, "zone" ) != 0  &&
1218        strmatch2( tld, "zw" ) != 0
1219        )
1220    {
1221       if (!doWarnings)
1222 	 fprintf( stderr, "warning: possibly incorrect domain name: %s\n", url );
1223       retval = 1;
1224    }
1225 
1226    /* allowed characters:
1227     * 0 - 9
1228     * A - Z    a - z
1229     * [ ] :
1230     * - . %
1231     * illegal but common: _
1232     * URL may start with "|." which is interpreted as "no subdomain".
    * RFC3986 is superseded by RFC5890-5895
1234     * According to the new RFCs labels can contain UTF characters
1235     */
1236    s = (unsigned char *) url;
1237    if (*s == '|'  &&  *(s+1) == '.')
1238       s += 2;
1239    for (;  *s != '\0';  s++)
1240    {
1241       if (!( (*s >= 'a' && *s <= 'z')  ||
1242              (*s >= 'A' && *s <= 'Z')  ||
1243 	     (*s >= '0' && *s <= '9')  ||
1244 	     (*s == '.'  ||  *s == '-'  ||  *s == ':'  ||  *s == '['  ||  *s == ']'  ||  *s == '%')  ||
1245 	     (*s == '_')  ||  *s >= 0x80 ))
1246       {
1247          fprintf( stderr, "error: domainname '%s' has illegal character '%c'\n", url, *s );
1248          retval = 0;
1249 	 break;
1250       }
1251    }
1252 
1253    if (first_slash != NULL)
1254       *first_slash = '/';
1255 
1256    return retval;
1257 }
1258 
1259 
initTable(char * tableName)1260 void initTable( char * tableName )
1261 {
1262    table = (struct UFDBgentableNode *) malloc( sizeof( struct UFDBgentableNode ) );
1263    table->tag = (unsigned char *) _STRDUP( tableName );
1264    table->totalSubNodes = 0;
1265    table->nSTA = 0;
1266    table->stas = NULL;
1267    numNodes++;
1268 
1269    numIndexNodes = 0;
1270 }
1271 
1272 
#if 0
/*
 * _trealloc: grow an array of struct UFDBgentableNode in rounded-up steps.
 * NOTE: this function is compiled out by the surrounding "#if 0".
 *
 * p: the current array; it is passed to realloc() when a resize is needed.
 * n: the element count from which the rounded capacity is derived.
 *
 * The capacity is rounded with ROUNDUP() when n < ROUNDUPBYCUTOFF and with
 * BIGROUNDUP() otherwise (these macros are defined elsewhere).
 * realloc() is only called when the rounded capacity for n differs from the
 * rounded capacity for n-1; otherwise p is returned unchanged.
 *
 * returns: p itself, or the result of realloc() which is not checked for NULL here.
 */
static UFDB_GCC_MALLOC_ATTR UFDB_GCC_INLINE void * _trealloc( void * p, int n )
{
   int nup;

   /* n == 2 always reallocates to ROUNDUP(2) elements, without comparing against n-1 */
   if (n == 2)
      return realloc( p, ROUNDUP(2) * sizeof(struct UFDBgentableNode) );

   if (n < ROUNDUPBYCUTOFF)
   {
      nup = ROUNDUP(n);
      /* same rounded capacity as for n-1: no resize */
      if (nup == ROUNDUP(n-1))
         return p;
   }
   else
   {
      nup = BIGROUNDUP(n);
      /* same rounded capacity as for n-1: no resize */
      if (nup == BIGROUNDUP(n-1))
         return p;
   }

   /* the rounded capacity changed: resize to nup elements */
   return realloc( p, nup * sizeof(struct UFDBgentableNode) );
}
#endif
1297 
1298 
/*
 * When UFDB_OVERRIDE_GCC_OPT is set and the compiler is GCC 4.4 or newer:
 * save the current GCC options and compile the code that follows with O3.
 * NOTE(review): the matching "#pragma GCC pop_options" is not visible in
 * this part of the file - confirm that it exists further down.
 */
#if UFDB_OVERRIDE_GCC_OPT  &&  ((__GNUC__ > 4)  ||  (__GNUC__ == 4  &&  __GNUC_MINOR__ >= 4))
#pragma GCC push_options
#pragma GCC optimize ("O3")
/* do not use loop unrolling since it is slower */
#endif
1304 
1305 
1306 /*
 * UFDBinsertURL: insert revURL into the tree below node.
1308  * return values: 1 if revURL already exists, 0 if revURL was inserted.
1309  */
1310 UFDB_GCC_HOT
UFDBinsertURL(struct UFDBgentableNode * node,UFDBrevURL * revURL,char * origURL,UFDBurlType type)1311 static int UFDBinsertURL(
1312    struct UFDBgentableNode * node,
1313    UFDBrevURL *              revURL,
1314    char *                    origURL,
1315    UFDBurlType               type )
1316 {
1317    /*
1318     * find the index where our URL has to be inserted before or is equal to
1319     * e.g. the level "net" is either "< nl" or "= net".
1320     */
1321    int  cmp;
1322    unsigned int  sta_i, i, j;
1323    int  b, e;
1324    int  rv;
1325    int  tailInsert;
1326    struct UFDBgentableSTA * sta;
1327 
1328    tailInsert = rv = 0;
1329    cmp = 0;
1330    i = 0;
1331    sta_i = -1;
1332 
1333 newlevel:
1334    DEBUG(( stderr, "newlevel: UFDBinsertURL( %p, %p, %d )  nodetag '%s'  urlpart '%s'\n",
1335            (void*) node, (void*) revURL, type,
1336            node != NULL ? (char *) node->tag : "NULL",
1337            revURL != NULL ? (char*) revURL->part : "NULL"  ));
1338 
1339    if (revURL == NULL)
1340    {
1341       if (node != NULL)
1342       {
1343          DEBUG(( stderr, "   revURL=NULL  node: totalSubNodes=%d, nSTA=%d  tag='%s'\n",
1344 	         node->totalSubNodes, node->nSTA, node->tag ));
1345 	 if (node->totalSubNodes > 0  &&  node->tag[0] != '/')
1346 	 {
1347 	    /* interesting... we are trying to insert "xxx.com" while the tree already
1348 	     * has one or more members with subnodes like sss.xxx.com or xxx.com/foo[?a=1].
1349 	     * Lets optimise this and get rid of the subdomains !
1350 	     */
1351             DEBUG(( stderr, "   inserted URL has subdomains, first subdomain/path is '%s'\n",
1352                     node->stas->subNodes[0].tag ));
1353             if (ufdbGV.debug)
1354                ufdbLogMessage( "inserted URL (%s) part has subnodes, first %s is '%s'  urlpart NULL  "
1355                                "nodetag '%s'  removing subnodes",
1356                                origURL,
1357                                type==UFDBdomain ? "subdomain" : "path/parameter",
1358                                node->stas->subNodes[0].tag, node->tag );
1359 	    if (!skipOptimisations)
1360 	    {
1361                DEBUG(( stderr, "   removing subdomains of node '%s'\n", node->tag ));
1362 	       rv = 1;
1363 	       node->totalSubNodes = 0;
1364 	       node->nSTA = 0;
1365 	       free( node->stas );		/* TO-DO: should free() a tree ! */
1366 	       node->stas = NULL;
1367 	    }
1368 	 }
1369       }
1370       else
1371       {
1372          DEBUG(( stderr, "        revURL=NULL node=NULL\n" ));
1373       }
1374       return rv;
1375    }
1376 
1377    /* there is a 2-level insert:
1378     * first find the appropriate STA for the insert and then insert the revURL into nodes[].
1379     * If the STA is full, split it into 2 STAs.
1380     *
1381     * If the input file is already (mostly) sorted, almost all insertions take place at the end.
1382     * So lets optimise this by first looking at the end before doing a binary search.
1383     *
1384     * We may have to delete URLs if we are inserting a URL with a shorter path and
1385     * URLs with a longer path (and same path as the one being inserted) exists.
1386     */
1387 
1388    if (node->nSTA == 0)		/* the very first entry at this level */
1389    {
1390       DEBUG(( stderr, "   UFDBinsertURL nSTA=0 creating first STA for node '%s' and subNode '%s'\n",
1391 	      node->tag, (revURL==NULL ? (unsigned char *)"NULL" : revURL->part) ));
1392 
1393       numNodes++;
1394       node->totalSubNodes = 1;
1395       node->nSTA = 1;
1396       node->stas = (struct UFDBgentableSTA *) malloc( 1 * sizeof(struct UFDBgentableSTA) );
1397       sta = node->stas;
1398       sta->nSubNodes = 1;
1399       sta->nodeArraySize = MinNodesPerSTA;
1400       sta->subNodes = (struct UFDBgentableNode *) malloc( MinNodesPerSTA * sizeof(struct UFDBgentableNode) );
1401       sta->subNodes->totalSubNodes = 0;
1402       sta->subNodes->nSTA = 0;
1403       sta->subNodes->stas = NULL;
1404       sta->subNodes->tag = (unsigned char *) _STRDUP( (char *) revURL->part );
1405 
1406       node = &( sta->subNodes[0] );
1407       revURL = revURL->next;
1408       DEBUG(( stderr, "   UFDBinsertURL going to newlevel\n" ));
1409       goto newlevel;
1410    }
1411 
1412    /* check for tail insert: compare with last element of last STA */
1413    sta_i = node->nSTA - 1;
1414    sta = &( node->stas[sta_i] );
1415    i = sta->nSubNodes - 1;
1416    DEBUG(( stderr, "   going to insert '%s' and last node of the last STA(0x%08lx:%d:%d) is '%s'\n",
1417            (char *) revURL->part, (long) sta, sta_i, sta->nSubNodes, (char *) sta->subNodes[i].tag ));
1418 
1419    cmp = strcmpURLpart( (char*) revURL->part, (char*) sta->subNodes[i].tag );
1420    DEBUG(( stderr, "      strcmpURLpart( %s, %s ) = %d\n", (char*) revURL->part,
1421                    (char*) sta->subNodes[i].tag, cmp ));
1422    if (cmp == 0)
1423    {
1424       if (revURL->part[0] == '/'  &&  sta->subNodes[i].totalSubNodes > 0)
1425       {
1426          cmp = strcmp( (char*) revURL->part, (char*) sta->subNodes[i].tag );
1427          DEBUG(( stderr, "      nodetag==urlpart and PATH and node tag has children.  new cmp %d\n", cmp ));
1428       }
1429       else
1430       {
1431          /* node tag matches url tag; there will be no insert at this level */
1432          DEBUG(( stderr, "      nodetag==urlpart: do not insert; goto node_match\n" ));
1433          goto node_match;
1434       }
1435    }
1436    if (cmp > 0)			/* it is a tail insert; we are done with the search for STA and STA index */
1437    {
1438       tailInsert = 1;
1439       goto sta_found;
1440    }
1441 
1442    /* there is no tail insert, so start with finding the right STA */
1443    e = node->nSTA - 1;
1444    if (e == 0)
1445       goto sta_found;	/* there is only one STA; sta and sta_i are already set */
1446    b = 0;
1447    while (b <= e)
1448    {
1449       sta_i = (b + e) / 2;
1450       sta = &( node->stas[sta_i] );
1451       i = 0;
1452       cmp = strcmpURLpart( (char *) revURL->part, (char *) sta->subNodes[i].tag );
1453       if (cmp < 0)
1454 	 e = sta_i - 1;
1455       else if (cmp == 0)
1456          goto node_match;
1457       else /* cmp>0 */
1458       {
1459 	 i = sta->nSubNodes - 1;
1460 	 cmp = strcmpURLpart( (char *) revURL->part, (char *) sta->subNodes[i].tag );
1461 	 if (cmp == 0)
1462 	    goto node_match;
1463 	 if (cmp < 0)
1464 	    break;
1465 	 b = sta_i + 1;
1466       }
1467    }
1468 sta_found:
1469    DEBUG(( stderr, "      inserting in STA sta_i=%d sta=0x%08lx\n", sta_i, (long) sta ));
1470 
1471    /* The STA is found, check if an enlargement or a split is necessary */
1472    if (sta->nSubNodes == MaxNodesPerSTA)
1473    {
1474       struct UFDBgentableSTA * newsta;
1475 
1476       /* split an STA into two STAs with both MaxNodesPerSTA subNodes.
1477        * The existing subNodes are divided 50/50 amongst the current STA and the new STA
1478        * EXCEPT when the current STA is the last STA due to very frequent tail insertion.
1479        */
1480       newsta = (struct UFDBgentableSTA *) malloc( sizeof(struct UFDBgentableSTA) );
1481       newsta->nodeArraySize = MaxNodesPerSTA;
1482       newsta->subNodes = (struct UFDBgentableNode *) malloc( MaxNodesPerSTA * sizeof(struct UFDBgentableNode) );
1483       if (sta_i == node->nSTA - 1)		/* is the STA the last one of this node ? */
1484       {
1485 	 DEBUG(( stderr, "++ splitting last STA sta_i=%d\n", sta_i ));
1486 	 /* only transfer one subNode to the new STA */
1487 	 newsta->nSubNodes = 1;
1488 	 newsta->subNodes[0] = sta->subNodes[MaxNodesPerSTA-1];
1489 	 sta->nSubNodes--;
1490 	 /* insert the new STA in the node (at the end) */
1491 	 node->nSTA++;
1492 	 node->stas = (struct UFDBgentableSTA *)
1493                          realloc( node->stas, node->nSTA * sizeof(struct UFDBgentableSTA) );
1494 	 node->stas[node->nSTA-1] = *newsta;
1495 	 /* must reassign sta because node->stas is realloced */
1496 	 sta = &node->stas[sta_i];
1497 	 newsta = &node->stas[node->nSTA-1];
1498       }
1499       else					/* not the last STA of this node */
1500       {
1501 	 DEBUG(( stderr, "++ splitting STA sta_i=%d\n", sta_i ));
1502 	 /* divide subNodes 50/50 */
1503 	 newsta->nSubNodes = MaxNodesPerSTA/2;
1504 	 for (i = 0; i < MaxNodesPerSTA/2; i++)
1505 	 {
1506 	    newsta->subNodes[i] = sta->subNodes[i + MaxNodesPerSTA/2];
1507 	 }
1508 	 sta->nSubNodes -= MaxNodesPerSTA/2;
1509 	 /* insert the new STA in the node (not at the end) */
1510 	 node->nSTA++;
1511 	 node->stas = (struct UFDBgentableSTA *)
1512                          realloc( node->stas, node->nSTA * sizeof(struct UFDBgentableSTA) );
1513 	 for (j = node->nSTA-1; j > sta_i+1; j--)
1514 	    node->stas[j] = node->stas[j-1];
1515 	 node->stas[sta_i+1] = *newsta;
1516 	 /* must reassign sta because node->stas is realloced */
1517 	 sta = &node->stas[sta_i];
1518 	 newsta = &node->stas[sta_i+1];
1519       }
1520 
1521       /* determine if sta has to point to the new STA */
1522       i = 0;
1523       cmp = strcmpURLpart( (char *) revURL->part, (char *) newsta->subNodes[i].tag );
1524       if (cmp >= 0)
1525          sta = newsta;
1526       if (cmp == 0)
1527          goto node_match;
1528    }
1529    else if (sta->nSubNodes >= sta->nodeArraySize)
1530    {
1531       /* extend STA */
1532       unsigned int  newSize;
1533       if (sta->nodeArraySize == 0)
1534          newSize = 1;
1535       else if (sta->nodeArraySize == 1)
1536          newSize = 8;
1537       else if (sta->nodeArraySize == 8)
1538          newSize = 256;
1539       else
1540          newSize = MaxNodesPerSTA;
1541       DEBUG(( stderr, "++ sta->nSubNodes is %d, extending subNodes array from %d to %d elements\n",
1542               sta->nSubNodes, sta->nodeArraySize, newSize ));
1543       sta->nodeArraySize = newSize;
1544       sta->subNodes = (struct UFDBgentableNode *)
1545                          realloc( sta->subNodes, newSize * sizeof(struct UFDBgentableNode) );
1546    }
1547 
1548    /* the STA is found, subNodes is guaranteed to have space for one more element; search now inside this STA */
1549    if (tailInsert)
1550    {
1551       i = sta->nSubNodes - 1;
1552       cmp = 1;
1553    }
1554    else
1555    {
1556       b = 0;
1557       e = sta->nSubNodes - 1;
1558       DEBUG(( stderr, "         starting bsearch in STA:  b 0  e %d\n", e ));
1559       while (b <= e)			/* binary search STA */
1560       {
1561 	 i = (b + e) / 2;
1562 	 cmp = strcmpURLpart( (char *) revURL->part, (char *) sta->subNodes[i].tag );
1563 	 if (cmp < 0)
1564 	    e = i - 1;
1565 	 else if (cmp > 0)
1566 	    b = i + 1;
1567 	 else
1568 	    goto node_match;
1569       }
1570       DEBUG(( stderr, "         NOTFOUND after bsearch in STA:  part '%s'  cmp %d  i %d  b %d  e %d  "
1571                       "totalSubNodes %d\n",
1572 	      (revURL==NULL ? (unsigned char *)"NULL" : revURL->part), cmp, i, b, e, node->totalSubNodes ));
1573    }
1574 
1575    if (cmp < 0  &&  revURL != NULL  &&  revURL->part[0] == '/')
1576    {
1577       size_t partlen = strlen( (char*) revURL->part );
1578 
1579       if (strncmp( (char*) revURL->part, (char*) sta->subNodes[i].tag, partlen ) == 0  &&  revURL->next == NULL)
1580       {
1581          if (ufdbGV.debug)
1582             ufdbLogMessage( "inserted URL (%s) part has subnodes, first %s is '%s'  part NULL  nodetag '%s'",
1583                             origURL,
1584                             type==UFDBdomain ? "subdomain" : "path/parameter",
1585                             node->stas->subNodes[0].tag, node->tag );
1586          if (doWarnings)
1587             ufdbLogMessage( "URL with longer path is already in the table:\n"
1588                             "   inserting URL %s\n"
1589                             "   removing URL with path %s",
1590                             origURL, sta->subNodes[i].tag );
1591          node = &sta->subNodes[i];
1592          node->tag[partlen] = '\0';     /* truncate the longer path effectively inserting the shorter path */
1593          if (node->totalSubNodes > 0)
1594          {
1595             /* the longer path most likely has no children but remove them if they exist */
1596             node->totalSubNodes = 0;
1597             node->nSTA = 0;
1598             free( node->stas );		/* TO-DO: should free() a tree ! */
1599             node->stas = NULL;
1600          }
1601 
1602          /* remove other longer paths */
1603          i++;
1604          while (i < sta->nSubNodes  &&
1605                 strncmp( (char*) revURL->part, (char*) sta->subNodes[i].tag, partlen ) == 0)
1606          {
1607             if (doWarnings)
1608                ufdbLogMessage( "   also removing URL with path %s", sta->subNodes[i].tag );
1609             for (j = i + 1;  j < sta->nSubNodes;  j++)
1610                sta->subNodes[j-1] = sta->subNodes[j];
1611             sta->nSubNodes--;
1612          }
1613 
1614          /* XXX TODO: remove longer paths in the next STA */
1615 
1616          /* abort the insertion */
1617          return 1;
1618       }
1619    }
1620 
1621    node->totalSubNodes++;
1622    DEBUG(( stderr, "         totalsubnodes of \"%s\" incremented to %d\n", node->tag, node->totalSubNodes ));
1623    numNodes++;
1624 
1625    /* implemented optimisations:
1626     * do not add subdom.abc.com/aurl if abc.com is already in the tree
1627     * do not add subdom.abc.com if abc.com is already in the tree
1628     * remove subdom.abc.com from tree if abc.com is being inserted
1629     */
1630 
1631 doinsert:
1632    if (cmp > 0)					/* this entry > subNodes[i] */
1633    {
1634       DEBUG(( stderr, "         cmp>0 after bsearch: urlpart '%s'  nodetag '%s'  i %d  nSubNodes %d  shifting %d\n",
1635 	      (revURL==NULL ? (unsigned char *)"NULL" : revURL->part),
1636               sta->subNodes[i].tag, i, sta->nSubNodes, sta->nSubNodes - 1 - i ));
1637       DEBUG(( stderr, "         insert '%s' after '%s'\n", revURL->part, sta->subNodes[i].tag ));
1638 
1639       i++;
1640 
1641       sta->nSubNodes++;
1642 
1643       /* make space in the array */
1644       for (j = sta->nSubNodes - 1;  j > i;  j--)
1645 	 sta->subNodes[j] = sta->subNodes[j-1];
1646 
1647       /* insert the current revURL into the array */
1648       sta->subNodes[i].totalSubNodes = 0;
1649       sta->subNodes[i].nSTA = 0;
1650       sta->subNodes[i].stas = NULL;
1651       sta->subNodes[i].tag = (unsigned char *) _STRDUP( (char *) revURL->part );
1652 
1653       /* process the tail of revURL */
1654       node = &( sta->subNodes[i] );
1655       revURL = revURL->next;
1656       goto newlevel;
1657    }
1658    else if (cmp < 0)				/* this entry < subNodes[i] */
1659    {
1660       DEBUG(( stderr, "      cmp<0 after bsearch:  urlpart '%s'  nodetag '%s'  i %d  nSubNodes %d  shifting %d\n",
1661 	      (revURL==NULL ? (unsigned char *)"NULL" : revURL->part),
1662               sta->subNodes[i].tag, i, sta->nSubNodes, sta->nSubNodes - 1 - i ));
1663       DEBUG(( stderr, "         insert '%s' before '%s'\n", revURL->part, sta->subNodes[i].tag ));
1664 
1665       sta->nSubNodes++;
1666 
1667       /* make space in the array */
1668       for (j = sta->nSubNodes - 1;  j > i;  j--)
1669 	 sta->subNodes[j] = sta->subNodes[j-1];
1670 
1671       /* insert the current revURL into the array */
1672       sta->subNodes[i].totalSubNodes = 0;
1673       sta->subNodes[i].nSTA = 0;
1674       sta->subNodes[i].stas = NULL;
1675       sta->subNodes[i].tag = (unsigned char *) _STRDUP( (char *) revURL->part );
1676 
1677       /* process the tail of revURL */
1678       node = &( sta->subNodes[i] );
1679       revURL = revURL->next;
1680       goto newlevel;
1681    }
1682    else if (cmp == 0)				/* an exact match at this level */
1683    {
1684 node_match:
1685       DEBUG(( stderr, "         cmp==0 after bsearch:  urlpart '%s'  nodetag '%s'  node.totalSubNodes %u  "
1686                       "sta->nSubNodes %u  i %d\n",
1687 	      revURL->part, sta->subNodes[i].tag, sta->subNodes[i].totalSubNodes, sta->nSubNodes, i ));
1688 
1689       // we must deal with an exception here:  if the table has example.com/foobar and we are
1690       // inserting example.com/foo?p=v then cmp==0 is wrong and we must correct it here.
1691       if (revURL->part[0] == '/'  &&  revURL->next != NULL)
1692       {
1693          int cmp2 = strcmp( (char*) revURL->part, (char*) sta->subNodes[i].tag );
1694          if (cmp2 != 0)
1695          {
1696             DEBUG(( stderr, "         correction for path %s with parameters: cmp must be %d\n",
1697                     sta->subNodes[i].tag, cmp2 ));
1698             cmp = cmp2;
1699             goto doinsert;
1700          }
1701       }
1702 
1703       /* do not add revURL example.com/foo if example.com is in the table */
1704       if (type == UFDBurl)
1705       {
1706          if (skipOptimisations  ||  sta->subNodes[i].totalSubNodes != 0)
1707 	 {
1708 	    node = &( sta->subNodes[i] );
1709 	    revURL = revURL->next;
1710 	    goto newlevel;
1711 	 }
1712       }
1713       else
1714       {
1715 	 node = &( sta->subNodes[i] );
1716 	 revURL = revURL->next;
1717 	 DEBUG(( stderr, "               node match: going to newlevel\n" ));
1718 	 goto newlevel;
1719       }
1720    }
1721 
1722    return rv;
1723 }
1724 
1725 #if UFDB_OVERRIDE_GCC_OPT && ((__GNUC__ > 4)  ||  (__GNUC__ == 4  &&  __GNUC_MINOR__ >= 4))
1726 #pragma GCC pop_options
1727 #endif
1728 
1729 
1730 /* generate a binary table file, database table format 2.0
1731  */
createMemTable_2_0(struct UFDBgentableNode * node)1732 void createMemTable_2_0( struct UFDBgentableNode * node )
1733 {
1734    unsigned int i, j;
1735    struct UFDBgentableSTA * sta;
1736 
1737    DEBUG(( stderr, "tag=%s  totalSubNodes=%d #STAs=%d\n", node->tag, node->totalSubNodes, node->nSTA ));
1738    mem_puts( node->tag );
1739 
1740    if (node->totalSubNodes > 0)
1741    {
1742       DEBUG(( stderr, "sublevel " ));
1743       mem_putc( UFDBsubLevel );
1744 
1745       /* write the number of subnodes in a 1-byte or 4-byte code */
1746       if (node->totalSubNodes <= 255)
1747       {
1748 	 DEBUG(( stderr, "#1byte-subNodes=%d\n", node->totalSubNodes ));
1749          mem_putc( node->totalSubNodes );
1750 	 numSub8plus++;
1751       }
1752       else
1753       {
1754 	 DEBUG(( stderr, "#4byte-subNodes=%d\n", node->totalSubNodes ));
1755          mem_putc( 0 );
1756          i = node->totalSubNodes;
1757 	 if (i >= 256*256)
1758 	    numSub64Kplus++;
1759 	 else
1760 	    numSub255plus++;
1761          mem_putc( i % 256 );
1762 	 i = i / 256;
1763          mem_putc( i % 256 );
1764 	 i = i / 256;
1765          mem_putc( i % 256 );
1766 	 if (i > 32 && doWarnings)
1767 	    fprintf( stderr, "Warning: LARGE number of subnodes: %d for tag %s\n", node->totalSubNodes, node->tag );
1768       }
1769       DEBUG(( stderr, "      tag = %-18s sub-level   %d subnode(s)\n", node->tag, node->totalSubNodes ));
1770       if (ufdbGV.debug > 1)
1771       {
1772          ufdbLogMessage( "node '%s' has %d subnodes in %d STAs", node->tag, node->totalSubNodes, node->nSTA );
1773 	 if (node->nSTA > 1)
1774 	    for (i = 0; i < node->nSTA; i++)
1775 	       ufdbLogMessage( "   STA %d has %d subnodes", i, node->stas[i].nSubNodes );
1776       }
1777    }
1778    else
1779    {
1780       numLeafNodes++;
1781       DEBUG(( stderr, "      tag = %-18s leaf (no-next-level)\n", node->tag ));
1782    }
1783 
1784    for (i = 0; i < node->nSTA; i++)
1785    {
1786       sta = &node->stas[i];
1787       for (j = 0; j < sta->nSubNodes; j++)
1788       {
1789 	 DEBUG(( stderr, "recursive-call STA %d subnode %d\n", i, j ));
1790 	 createMemTable_2_0( &sta->subNodes[j] );
1791 
1792 	 if (sta->subNodes[j].totalSubNodes == 0)
1793 	 {
1794 	    if (i != node->nSTA - 1  ||  j != sta->nSubNodes - 1)
1795 	    {
1796 	       DEBUG(( stderr, "samelevel\n" ));
1797 	       mem_putc( UFDBsameLevel );
1798 	    }
1799 	 }
1800 	 else
1801 	 {
1802 	    DEBUG(( stderr, "prevlevel\n" ));
1803 	    mem_putc( UFDBprevLevel );
1804 	 }
1805       }
1806    }
1807 }
1808 
1809 
1810 /* generate a binary table file, database table format 2.1
1811  */
1812 UFDB_GCC_HOT
createMemTable_2_1(struct UFDBgentableNode * node)1813 void createMemTable_2_1( struct UFDBgentableNode * node )
1814 {
1815    unsigned int i, j;
1816    static int   indent = 0;
1817    struct UFDBgentableSTA * sta;
1818 
1819    if (doPadding)
1820       mem_pad();
1821    mem_puts( node->tag );
1822 
1823    if (node->totalSubNodes == 1)
1824    {
1825       mem_putc( UFDBsubLevel1 );
1826       numSub1++;
1827       DEBUG(( stderr, "%*s      tag = %-18s sub-level   1 subnode\n", indent, "", node->tag ));
1828    }
1829    else if (node->totalSubNodes == 2)
1830    {
1831       mem_putc( UFDBsubLevel2 );
1832       numSub2++;
1833       DEBUG(( stderr, "%*s      tag = %-18s sub-level   2 subnodes\n", indent, "", node->tag ));
1834    }
1835    else if (node->totalSubNodes == 3)
1836    {
1837       mem_putc( UFDBsubLevel3 );
1838       numSub3++;
1839       DEBUG(( stderr, "%*s      tag = %-18s sub-level   3 subnodes\n", indent, "", node->tag ));
1840    }
1841    else if (node->totalSubNodes == 4)
1842    {
1843       mem_putc( UFDBsubLevel4 );
1844       numSub4++;
1845       DEBUG(( stderr, "%*s      tag = %-18s sub-level   4 subnodes\n", indent, "", node->tag ));
1846    }
1847    else if (node->totalSubNodes == 5)
1848    {
1849       mem_putc( UFDBsubLevel5 );
1850       numSub5++;
1851       DEBUG(( stderr, "%*s      tag = %-18s sub-level   5 subnodes\n", indent, "", node->tag ));
1852    }
1853    else if (node->totalSubNodes == 6)
1854    {
1855       mem_putc( UFDBsubLevel6 );
1856       numSub6++;
1857       DEBUG(( stderr, "%*s      tag = %-18s sub-level   6 subnodes\n", indent, "", node->tag ));
1858    }
1859    else if (node->totalSubNodes == 7)
1860    {
1861       mem_putc( UFDBsubLevel7 );
1862       numSub7++;
1863       DEBUG(( stderr, "%*s      tag = %-18s sub-level   7 subnodes\n", indent, "", node->tag ));
1864    }
1865    else if (node->totalSubNodes > 0)
1866    {
1867       /* write the number of subnodes in a 2-byte, 3-byte or 5-byte code */
1868       if (node->totalSubNodes < 256)
1869       {
1870 	 mem_putc( UFDBsubLevel );
1871          mem_putc( node->totalSubNodes );		/* between 8 and 255 */
1872          if (node->totalSubNodes == 8)
1873             numSub8++;
1874          else
1875             numSub8plus++;
1876       }
1877       else if (node->totalSubNodes < 256*256)
1878       {
1879 	 mem_putc( UFDBsubLevelNNN );			/* more than 255 and less than 65536 */
1880          i = node->totalSubNodes;
1881          mem_putc( i % 256 );
1882 	 i = i / 256;
1883          mem_putc( i % 256 );
1884 	 numSub255plus++;
1885       }
1886       else
1887       {
1888 	 mem_putc( UFDBsubLevelNNNNN );			/* more than 65535 */
1889          i = node->totalSubNodes;
1890 	 if (doWarnings  &&  i > 15000000)
1891 	    fprintf( stderr, "LARGE number of subnodes: %d for node %s  *****\n", i, node->tag );
1892          mem_putc( i % 256 );
1893 	 i = i / 256;
1894          mem_putc( i % 256 );
1895 	 i = i / 256;
1896          mem_putc( i % 256 );
1897 	 i = i / 256;
1898          mem_putc( i % 256 );
1899 	 numSub64Kplus++;
1900       }
1901       DEBUG(( stderr, "%*s      tag = %-18s sub-level   %d subnodes\n", indent, "", node->tag, node->totalSubNodes ));
1902    }
1903    else
1904    {
1905       numLeafNodes++;
1906       DEBUG(( stderr, "%*s      tag = %-18s leaf node\n", indent, "", node->tag ));
1907    }
1908 
1909    for (i = 0; i < node->nSTA; i++)
1910    {
1911       sta = &node->stas[i];
1912       for (j = 0; j < sta->nSubNodes; j++)
1913       {
1914 	 indent += 3;
1915 	 createMemTable_2_1( &(sta->subNodes[j]) );
1916 	 indent -= 3;
1917 
1918 	 if (sta->subNodes[j].totalSubNodes == 0)
1919 	 {
1920 	    if (i != node->nSTA - 1  ||  j != sta->nSubNodes - 1)
1921 	       mem_putc( UFDBsameLevel );
1922 	 }
1923 	 else
1924 	    mem_putc( UFDBprevLevel );
1925       }
1926    }
1927 }
1928 
1929 
1930 /* generate a binary table file, database table format 2.2
1931  */
1932 UFDB_GCC_HOT
createMemTable_2_2(struct UFDBgentableNode * node)1933 void createMemTable_2_2( struct UFDBgentableNode * node )
1934 {
1935    unsigned int i, j;
1936    static int   indent = 0;
1937    struct UFDBgentableSTA * sta;
1938 
1939    if (doPadding)
1940       mem_pad();
1941    mem_putsnt( node->tag );
1942 
1943    if (node->totalSubNodes == 1)
1944    {
1945       mem_putc( UFDBsubLevel1 );
1946       numSub1++;
1947       DEBUG(( stderr, "%*s      tag = %-18s sub-level   1 subnode\n", indent, "", node->tag ));
1948    }
1949    else if (node->totalSubNodes == 2)
1950    {
1951       mem_putc( UFDBsubLevel2 );
1952       numSub2++;
1953       DEBUG(( stderr, "%*s      tag = %-18s sub-level   2 subnodes\n", indent, "", node->tag ));
1954    }
1955    else if (node->totalSubNodes == 3)
1956    {
1957       mem_putc( UFDBsubLevel3 );
1958       numSub3++;
1959       DEBUG(( stderr, "%*s      tag = %-18s sub-level   3 subnodes\n", indent, "", node->tag ));
1960    }
1961    else if (node->totalSubNodes == 4)
1962    {
1963       mem_putc( UFDBsubLevel4 );
1964       numSub4++;
1965       DEBUG(( stderr, "%*s      tag = %-18s sub-level   4 subnodes\n", indent, "", node->tag ));
1966    }
1967    else if (node->totalSubNodes == 5)
1968    {
1969       mem_putc( UFDBsubLevel5 );
1970       numSub5++;
1971       DEBUG(( stderr, "%*s      tag = %-18s sub-level   5 subnodes\n", indent, "", node->tag ));
1972    }
1973    else if (node->totalSubNodes == 6)
1974    {
1975       mem_putc( UFDBsubLevel6 );
1976       numSub6++;
1977       DEBUG(( stderr, "%*s      tag = %-18s sub-level   6 subnodes\n", indent, "", node->tag ));
1978    }
1979    else if (node->totalSubNodes == 7)
1980    {
1981       mem_putc( UFDBsubLevel7 );
1982       numSub7++;
1983       DEBUG(( stderr, "%*s      tag = %-18s sub-level   7 subnodes\n", indent, "", node->tag ));
1984    }
1985    else if (node->totalSubNodes > 0)
1986    {
1987       /* write the number of subnodes in a 2-byte, 3-byte or 5-byte code */
1988       if (node->totalSubNodes < 256)
1989       {
1990 	 mem_putc( UFDBsubLevel );
1991          mem_putc( node->totalSubNodes );		/* between 8 and 255 */
1992          if (node->totalSubNodes == 8)
1993             numSub8++;
1994          else
1995             numSub8plus++;
1996       }
1997       else if (node->totalSubNodes < 256*256)
1998       {
1999 	 mem_putc( UFDBsubLevelNNN );		/* more than 255 and less than 65536 */
2000          i = node->totalSubNodes;
2001          mem_putc( i % 256 );
2002 	 i = i / 256;
2003          mem_putc( i % 256 );
2004 	 numSub255plus++;
2005       }
2006       else
2007       {
2008 	 mem_putc( UFDBsubLevelNNNNN );		/* more than 65535 */
2009          i = node->totalSubNodes;
2010 	 if (doWarnings  &&  i > 15000000)
2011 	    fprintf( stderr, "LARGE number of subnodes: %d for node %s  *****\n", i, node->tag );
2012          mem_putc( i % 256 );
2013 	 i = i / 256;
2014          mem_putc( i % 256 );
2015 	 i = i / 256;
2016          mem_putc( i % 256 );
2017 	 i = i / 256;
2018          mem_putc( i % 256 );
2019 	 numSub64Kplus++;
2020       }
2021       DEBUG(( stderr, "%*s      tag = %-18s sub-level   %d subnodes\n", indent, "",
2022               node->tag, node->totalSubNodes ));
2023    }
2024    else
2025    {
2026       /* node->totalSubNodes == 0 */
2027       numLeafNodes++;
2028       DEBUG(( stderr, "%*s      tag = %-18s leaf node\n", indent, "", node->tag ));
2029    }
2030 
2031    for (i = 0; i < node->nSTA; i++)
2032    {
2033       sta = &node->stas[i];
2034       for (j = 0; j < sta->nSubNodes; j++)
2035       {
2036 	 indent += 3;
2037 	 createMemTable_2_2( &(sta->subNodes[j]) );
2038 	 indent -= 3;
2039 
2040 	 if (sta->subNodes[j].totalSubNodes == 0)
2041 	 {
2042 	    if (i != node->nSTA - 1  ||  j != sta->nSubNodes - 1)
2043 	       mem_putc( UFDBsameLevel );
2044 	 }
2045 	 else
2046 	    mem_putc( UFDBprevLevel );
2047       }
2048    }
2049 }
2050 
2051 
2052 /* generate a binary table file, database table format 3.0
2053  */
2054 #if UFDB_DBFORMAT_3
2055 #include "genTable3.c"
2056 #endif
2057 
2058 
/* need a forward declaration since addDomain() and addOtherYoutubeURLs() call each other */
2060 UFDB_GCC_HOT UFDB_GCC_INLINE
2061 static void addDomain(
2062    UFDBthreadAdmin * admin,
2063    char *   	     domain,
2064    UFDBurlType       type );
2065 
2066 
2067 UFDB_GCC_HOT
addOtherYoutubeURLs(UFDBthreadAdmin * admin,char * id)2068 static void addOtherYoutubeURLs(
2069    UFDBthreadAdmin * admin,
2070    char *            id )
2071 {
2072    char ** u;
2073    char    tmpURL[128];
2074    char *  otherURLs[] = {
2075       "m.youtube.com/watch?v=%s",
2076       "youtube.com/embed/%s",
2077       "youtube-nocookie.com/embed/%s",
2078       "youtube.com/get_video_info?video_id=%s",
2079       "youtube.com/get_video_info?content_v=%s",
2080       "youtube-nocookie.com/get_video_info?video_id=%s",
2081       "youtube-nocookie.com/get_video_info?content_v=%s",
2082       "ytimg.googleusercontent.com/vi/%s/",
2083       "i.ytimg.com/vi/%s/",
2084       "i1.ytimg.com/vi/%s/",
2085       "i2.ytimg.com/vi/%s/",
2086       "i3.ytimg.com/vi/%s/",
2087       "i4.ytimg.com/vi/%s/",
2088       "i5.ytimg.com/vi/%s/",
2089       "i6.ytimg.com/vi/%s/",
2090       "i7.ytimg.com/vi/%s/",
2091       "i8.ytimg.com/vi/%s/",
2092       "i9.ytimg.com/vi/%s/",
2093       "i.ytimg.com/vi_webp/%s/",
2094       "i1.ytimg.com/vi_webp/%s/",
2095       "i2.ytimg.com/vi_webp/%s/",
2096       "i3.ytimg.com/vi_webp/%s/",
2097       "i4.ytimg.com/vi_webp/%s/",
2098       "i5.ytimg.com/vi_webp/%s/",
2099       "i6.ytimg.com/vi_webp/%s/",
2100       "i7.ytimg.com/vi_webp/%s/",
2101       "i8.ytimg.com/vi_webp/%s/",
2102       "i9.ytimg.com/vi_webp/%s/",
2103       "i.ytimg.com/sb/%s/",
2104       "i1.ytimg.com/sb/%s/",
2105       "i2.ytimg.com/sb/%s/",
2106       "i3.ytimg.com/sb/%s/",
2107       "i4.ytimg.com/sb/%s/",
2108       "i5.ytimg.com/sb/%s/",
2109       "i6.ytimg.com/sb/%s/",
2110       "i7.ytimg.com/sb/%s/",
2111       "i8.ytimg.com/sb/%s/",
2112       "i9.ytimg.com/sb/%s/",
2113       NULL
2114    };
2115 
2116    for (u = otherURLs;  *u != NULL;  u++)
2117    {
2118       sprintf( tmpURL, *u, id );
2119       addDomain( admin, tmpURL, UFDBurl );
2120    }
2121 }
2122 
2123 
2124 UFDB_GCC_HOT
addDomain(UFDBthreadAdmin * admin,char * domain,UFDBurlType type)2125 static void addDomain(
2126    UFDBthreadAdmin * admin,
2127    char *   	     domain,
2128    UFDBurlType       type )
2129 {
2130    char *            t;
2131    UFDBrevURL *      revUrl;
2132    int               rv;
2133    int               portnumber;
2134    char              protocol[16];
2135    char              strippedURL[UFDB_MAX_URL_LENGTH];
2136    char              strippedDomain[1024];
2137 
2138    /* strip starting and trailing whitespace */
2139    while (*domain == ' ' || *domain == '\t')
2140       domain++;
2141    for (t = domain;  *t != '\0' && *t != '\n';  t++)
2142       ;
2143    t--;
2144    while (t > domain  &&  (*t == ' ' || *t == '\t'))
2145    {
2146       *t = '\0';
2147       t--;
2148    }
2149    /* skip empty lines */
2150    if (*domain == '\0')
2151       return;
2152 
2153    if (ufdbGV.debug > 1)
2154       fprintf( stderr, "addDomain( %s )\n", domain );
2155 
2156    numEntries++;
2157 
2158    UFDBstripURL2( (char *) domain, stripWWW, strippedURL, strippedDomain, protocol, &portnumber );
2159 
2160    /* we do not check illegal domain names, but must check for '..' and dot at the start, because that
2161     * generates zero-length tags and may cause a lot of evil.
2162     */
2163    if (*strippedDomain == '.')
2164    {
2165       ufdbLogError( "domain starts with '.';  bad URL is not added: %s", domain );
2166       return;
2167    }
2168    for (t = strippedDomain;  *t != '\0';  t++)
2169    {
2170       if (*t == '.'  &&  *(t+1) == '.')
2171       {
2172          ufdbLogError( "found '..' in domain;  bad URL is not added: %s", domain );
2173 	 return;
2174       }
2175    }
2176 
2177    if (!utf8support)
2178    {
2179       unsigned char * s;
2180 
2181       for (s = (unsigned char *) strippedURL;  *s != '\0';  s++)
2182       {
2183          if (*s >= 0x80)
2184          {
2185             if (doWarnings)
2186                fprintf( stderr, "warning: must use format 2.2 or later for URL with UTF8 characters.\n"
2187                         "   skipping URL %s\n", domain );
2188             return;
2189          }
2190       }
2191    }
2192 
2193 #if 0
2194    if (ufdbGV.debug)
2195       ufdbLogMessage( "domain: %s\nstrippedurl: %s\nprotocol: %s\nport: %d",
2196                       domain, strippedURL, protocol, portnumber );
2197 #endif
2198 
2199    revUrl = UFDBgenRevURL4table( admin, (unsigned char *) strippedURL );
2200 
2201    /* first do a lookup of the domain, and if it already matches, it should not be added !  */
2202    if (skipOptimisations)
2203       rv = 0;
2204    else
2205       rv = UFDBgentableLookupRevUrl( table, revUrl );
2206    if (rv)
2207    {
2208       if (doWarnings)
2209          ufdbLogMessage( "URL %s is not added because it was already matched by a previous URL", domain );
2210    }
2211    else
2212    {
2213       rv = UFDBinsertURL( table, revUrl, domain, type );
2214       if (rv)
2215       {
2216          if (doWarnings)
2217             ufdbLogMessage( "URL %s has optimised subdomains or paths", domain );
2218       }
2219    }
2220 
2221    UFDBfreeRevURL( admin, revUrl );
2222 
2223    if (strncmp( strippedURL, "youtube.com/watch?v=", 20 ) == 0)
2224    {
2225       char * id;
2226       char * end;
2227       id = strippedURL + 20;
2228       if (*id != '\0'  &&  strlen(id) > 10)
2229       {
2230          /* remove other parameters after the video ID */
2231          end = id;
2232          while (*end != '\0'  &&  *end != '&')
2233             end++;
2234          *end = '\0';
2235          addOtherYoutubeURLs( admin, id );
2236       }
2237    }
2238 }
2239 
2240 
/* Fill encryptKey with 16 characters obtained from randomChar() followed by
 * a terminating NUL; the buffer must have room for 17 bytes.
 *
 * The generator is seeded with srandom() from the process id and the current
 * time.  NOTE(review): such a seed is guessable; confirm that this is
 * acceptable for the way the key is used.
 */
static void generateRandomKey( char * encryptKey )
{
   int i;

   /* Do the seed arithmetic in unsigned int: shifting a large pid left by
    * 12 bits as a signed int can overflow, which is undefined behaviour.
    * The seed is the same as before for all values that did not overflow.
    */
   srandom( ((unsigned int) getpid() << 12) + (unsigned int) time(NULL) );

   for (i = 0;  i < 16;  i++)
      encryptKey[i] = randomChar();
   encryptKey[16] = '\0';
}
2263 
2264 
/* Format the first 16 characters of encryptKey as four groups of four
 * characters separated by dashes ("XXXX-XXXX-XXXX-XXXX") and store the
 * NUL-terminated result in key, which must have room for 20 bytes.
 */
static void copyKey( char * key, char * encryptKey )
{
   char * out = key;
   char * in = encryptKey;
   int    group;
   int    k;

   for (group = 0;  group < 4;  group++)
   {
      if (group != 0)
         *out++ = '-';
      for (k = 0;  k < 4;  k++)
         *out++ = *in++;
   }
   *out = '\0';
}
2288 
2289 
encryptMemory(unsigned char * to,unsigned char * from,long n,unsigned char * key,char * format)2290 static inline void encryptMemory( unsigned char * to, unsigned char * from, long n,
2291                                   unsigned char * key, char * format )
2292 {
2293    ufdbCrypt uc;
2294 
2295 #if UFDB_DO_DEBUG
2296    if (ufdbGV.debug)
2297       fprintf( stderr, "encryptMemory( %p %p %'ld %16.16s )\n",
2298                (void*) to, (void*) from, n, key );
2299 #endif
2300 
2301    ufdbCryptInit( &uc, key, 16, format );
2302    ufdbEncryptText( &uc, to, from, n );
2303 }
2304 
2305 
2306 #if UFDB_BZ2LIB_SUPPORT
BZ2compressMemory(unsigned char * to,unsigned char * from,long size)2307 static inline long BZ2compressMemory( unsigned char * to, unsigned char * from, long size )
2308 {
2309    unsigned int new_size;
2310 
2311    new_size = (unsigned int) (size + 2048);
2312    if (BZ_OK != BZ2_bzBuffToBuffCompress( (char *) to, &new_size, (char *) from, size, 7, 0, 30 ))
2313    {
2314       fprintf( stderr, "compression failed.\n" );
2315       exit( 1 );
2316    }
2317 
2318    if (ufdbGV.debug)
2319       fprintf( stderr, "BZIP2 compression: from size %ld to %d\n", size, new_size );
2320 
2321    return new_size;
2322 }
2323 #endif
2324 
2325 
ZLIBcompressMemory(unsigned char * to,unsigned char * from,long size)2326 inline static long ZLIBcompressMemory( unsigned char * to, unsigned char * from, long size )
2327 {
2328    int       retval;
2329    z_stream  zs;
2330 
2331    zs.zalloc = Z_NULL;
2332    zs.zfree  = Z_NULL;
2333    zs.opaque = Z_NULL;
2334    zs.zalloc = ufdbZlibMalloc;
2335    zs.zfree = ufdbZlibFree;
2336 
2337    zs.next_in  = from;
2338    zs.avail_in = size;
2339 
2340    zs.next_out  = to;
2341    zs.avail_out = size + 2048;
2342 
2343 
2344    retval = deflateInit( &zs, Z_BEST_COMPRESSION );
2345    if (Z_OK != retval)
2346    {
2347       fprintf( stderr, "ZLIB initialisation failed: error %d  *****\n", retval );
2348       exit( 1 );
2349    }
2350    retval = deflate( &zs, Z_FINISH );
2351    if ((Z_STREAM_END != retval && Z_OK != retval)  ||  zs.avail_out == 0)
2352    {
2353       fprintf( stderr, "ZLIB compression failed: error %d  avail_out %u  *****\n", retval, zs.avail_out );
2354       exit( 1 );
2355    }
2356 
2357    if (ufdbGV.debug)
2358       fprintf( stderr, "ZLIB compression: from size %ld to %lu\n", size, zs.total_out );
2359 
2360    size = (long) zs.total_out;
2361    deflateEnd( &zs );
2362    return size;
2363 }
2364 
2365 
doCryptCompress(FILE * f,char * encryptKey,char * format)2366 static void doCryptCompress(
2367    FILE * f,
2368    char * encryptKey,
2369    char * format )
2370 {
2371    long   hdr_size;
2372    long   size;
2373    long   orig_size;
2374    unsigned char * buffer2;
2375 #if 0
2376    int    cksum = 0;
2377 #endif
2378 
2379    /* The table is in mem[]; doCrypt and/or doZLIBcompress or doBZ2compress is 1.
2380     * The result of this function is in mem[] and the size (mem_i) is adjusted.
2381     */
2382 
2383    orig_size = size = mem_i;
2384 
2385    if (ufdbGV.debug)
2386       fprintf( stderr, "doCryptCompress orig_size %'ld bytes  doCrypt=%d  doZLIBcompress=%d  doBZ2compress=%d  "
2387                        "format=%s\n",
2388                        orig_size, doCrypt, doZLIBcompress, doBZ2compress, format );
2389 
2390    buffer2 = (unsigned char *) malloc( size + 2048 );
2391    if (buffer2 == NULL)
2392    {
2393       fprintf( stderr, "cannot allocate memory for encryption and/or compression (size=%'ld)\n", size );
2394       exit( 1 );
2395    }
2396 
2397    /* make sure the 'result' is in buffer2 */
2398 #if UFDB_BZ2LIB_SUPPORT
2399    if (doBZ2compress)
2400    {
2401       size = BZ2compressMemory( buffer2, mem, orig_size );
2402       if (ufdbGV.debug)
2403          fprintf( stderr, "BZIP2 compressed %'ld bytes to %'ld bytes in buffer2\n", orig_size, size );
2404    }
2405    else
2406 #endif
2407    if (doZLIBcompress)
2408    {
2409       size = ZLIBcompressMemory( buffer2, mem, orig_size );
2410       if (ufdbGV.debug)
2411          fprintf( stderr, "ZLIB compressed %'ld bytes to %'ld bytes in buffer2\n", orig_size, size );
2412    }
2413    else
2414    {
2415       memcpy( buffer2, mem, size );
2416       if (ufdbGV.debug)
2417          fprintf( stderr, "copied %'ld bytes to buffer2 (no compression)\n", size );
2418    }
2419 
2420    /* crypt from buffer2 into mem */
2421    if (doCrypt)
2422    {
2423       encryptMemory( mem, buffer2, size, (unsigned char *) encryptKey, format );
2424       if (ufdbGV.debug)
2425          fprintf( stderr, "crypted %'ld bytes from buffer2 to mem\n", size );
2426    }
2427    else
2428    {
2429       memcpy( mem, buffer2, size );
2430       if (ufdbGV.debug)
2431          fprintf( stderr, "copied %'ld bytes from buffer2 to mem\n", size );
2432    }
2433 
2434 #if 0
2435    /* TODO fix the problem with 2.1 cksum */
2436    if (strcmp( format, "2.1" ) >= 0)
2437    {
2438       cksum = UFDBcalcCksum( buffer1, size );
2439    }
2440 #endif
2441 
2442    hdr_size = (doPadding || format[0] >= '3') ?
2443                  sizeof(struct UFDBfileHeader21) : sizeof(struct UFDBfileHeader);
2444    /* write mem to the file */
2445    fseek( f, (long) hdr_size, SEEK_SET );
2446 
2447    if (1 != fwrite( mem, size, 1, f ))
2448    {
2449       fprintf( stderr, "cannot write crypted/compressed table to file: fwrite failed.\n" );
2450       exit( 3 );
2451    }
2452    fflush( f );
2453 
2454    if (ufdbGV.debug)
2455       fprintf( stderr, "%'ld bytes written to file\n", size );
2456 
2457    /* truncate the file (if we did compression) */
2458    if ((doZLIBcompress || doBZ2compress)  &&  size < orig_size)
2459    {
2460       if (ftruncate( fileno(f), size + hdr_size ) < 0)
2461          fprintf( stderr, "failed to truncate compressed file to size %'ld", (long) size + hdr_size );
2462    }
2463 
2464    free( buffer2 );
2465 }
2466 
2467 
2468 #if 0
2469 static void UFDB_GCC_HOT convertSpecialCharacters( unsigned char * domain )
2470 {
2471    unsigned char * s;
2472    unsigned char * d;
2473 
2474    for (s = domain, d = domain;  *s != '\0';  s++)
2475    {
2476       if (*s == '%')
2477       {
2478          unsigned int hex;
2479 	 unsigned int h1, h2;
2480 
2481 	 h1 = *(s+1);
2482 	 h2 = *(s+2);
2483 	 if (isxdigit(h1) && isxdigit(h2))
2484 	 {
2485 	    hex  = (h1 <= '9') ? h1 - '0' : h1 - 'a' + 10;
2486 	    hex *= 16;
2487 	    hex += (h2 <= '9') ? h2 - '0' : h2 - 'a' + 10;
2488 	    if (hex == 0)
2489 	    {
2490 	       s += 2;
2491 	       continue;
2492 	    }
2493 	    else if (hex <= 0x20)
2494 	    {
2495 	       if (hex != '\t'  &&  hex != '\r'  &&  hex != '\n'  &&  hex != '\f')
2496 		  hex = ' ';
2497 	    }
2498 	    else
2499 	    {
2500 	       if (hex == 0x7f  ||  hex == 0xff)
2501 		  hex = ' ';
2502 	       else
2503 		  if (hex <= 'Z'  &&  hex >= 'A')
2504 		     hex += 'a' - 'A';
2505 	    }
2506 	    *d++ = hex;
2507 	    s += 2;
2508 	 }
2509 	 else
2510 	    *d++ = *s;
2511       }
2512       else
2513       {
2514          *d++ = *s;
2515       }
2516    }
2517    *d = '\0';
2518 }
2519 #endif
2520 
2521 
/* Return the length of the NUL-terminated path p, where each %XX escape
 * ('%' followed by two hexadecimal digits) counts as one character.
 */
static int calcpathlen( char * p )
{
   int len = 0;

   while (*p != '\0')
   {
      /* the <ctype.h> functions are only defined for unsigned char values
       * and EOF, so a plain char that may be negative must be cast first.
       */
      if (p[0] == '%'  &&  isxdigit( (unsigned char) p[1] )  &&  isxdigit( (unsigned char) p[2] ))
         p += 2;
      p++;
      len++;
   }
   return len;
}
2534 
2535 
2536 #define MAXPARAMS       200
2537 
/* qsort() comparison function for URL parameters.
 *
 * a and b point to pointers to NUL-terminated "name" or "name=value"
 * strings.  Only the names are compared, bytewise: the comparison stops at
 * the first '=' or at the end of either string, so parameters with the same
 * name compare equal regardless of their values.
 * A '=' is treated as the end of the name on both sides, which makes the
 * function symmetric: "foo" and "foo=1" compare equal in either order.
 */
static int sortstrcmp( const void * a, const void * b )
{
   const unsigned char * pa = *(const unsigned char * const *) a;
   const unsigned char * pb = *(const unsigned char * const *) b;
   int                   va, vb;

   for (;;)
   {
      va = (int) *pa++;
      vb = (int) *pb++;
      if (va == '=')
         va = 0;
      if (vb == '=')
         vb = 0;
      /* stop at the end of either name or at the first difference */
      if (va == 0  ||  vb == 0  ||  va != vb)
         return va - vb;
   }
}
2569 
2570 
sortURLparams(unsigned char * parlist)2571 static void sortURLparams( unsigned char * parlist )
2572 {
2573    char *          p;
2574    char *          sep;
2575    char *          pp[MAXPARAMS];
2576    unsigned int    i;
2577    unsigned int    n;
2578 
2579    if (strchr( (char*) parlist, '&' ) == NULL)
2580       return;
2581 
2582    p = (char*) parlist;
2583    n = 0;
2584    while ((sep = strchr( p, '&' )) != NULL)
2585    {
2586       *sep = '\0';
2587       pp[n] = ufdbStrdup( p );
2588       p = sep + 1;
2589       n++;
2590       if (n == MAXPARAMS-1)
2591       {
2592          ufdbLogError( "cannot sort parameters of %s", parlist );
2593          for (i = 0;  i < n;  i++)
2594             ufdbFree( pp[i] );
2595          return;
2596       }
2597    }
2598    if (n == 0)
2599       return;
2600    /* assign the last parameter to pp */
2601    pp[n] = ufdbStrdup( p );
2602 
2603    ++n;
2604 
2605    qsort( pp, (size_t) n, (size_t) sizeof(char*), sortstrcmp );
2606 
2607    p = (char*) parlist;
2608    for (i = 0;  i < n;  i++)
2609    {
2610       p += sprintf( p, "%s", pp[i] );
2611       if (i < n-1)
2612          *p++ = '&';
2613       ufdbFree( pp[i] );
2614    }
2615 }
2616 
2617 
2618 UFDB_GCC_HOT
main(int argc,char * argv[])2619 int main( int argc, char * argv[] )
2620 {
2621    int               n;
2622    int               opt;
2623    time_t            now;
2624    struct tm         tm;
2625    unsigned char *   d;
2626    char              encryptKey[16+1];
2627    char              key[16+3+1];
2628    char              flags[8+1];
2629    FILE *            fout;
2630    char *            fout_buffer;
2631    UFDBthreadAdmin * admin;
2632    int		     hdr_size;
2633    struct UFDBfileHeader21 header;
2634    char              date[64];
2635    char              outFileName[512];
2636    char              tempOutFileName[512];
2637    unsigned char     domain[4096];
2638 
2639 #if UFDB_DPDK_SUPPORT
2640    int retval;
2641    char * dummy[] = { "ufdbGenTable", NULL };
2642    retval = rte_eal_init( 0, dummy );
2643    if (retval < 0)
2644       rte_panic( "cannot initialize RTE EAL\n" );
2645 #endif
2646 
2647    UFDBinitializeGV( &ufdbGV );
2648 #if UFDBSS_SQUID
2649    strcpy( ufdbGV.progname, "ufdbGenTable" );
2650 #elif UFDBSS_RESTAPI
2651    strcpy( ufdbGV.progname, "ufdbRESTGenTable" );
2652 #elif UFDBSS_QS
2653    strcpy( ufdbGV.progname, "ufdbqsGenTable" );
2654 #else
2655    strcpy( ufdbGV.progname, "ufdbGenTable" );
2656 #endif
2657    UFDBappInit();
2658    admin = UFDBallocThreadAdmin();
2659    domainsFileName = NULL;
2660    urlsFileName = NULL;
2661    tableName = "defaulttable";
2662    date[0] = flags[0] = key[0] = encryptKey[0] = '\0';
2663 
2664    while ((opt = getopt( argc, argv, "BLDF:k:t:d:u:UNnCqpPsSVW?XZ" )) > 0)
2665    {
2666       switch (opt)
2667       {
2668       case 'B':
2669       case 'L':
2670          fprintf( stderr, "-B and -L options are ignored. The generated table has an L format\n" );
2671          // endian = opt;
2672          break;
2673       case 'D':
2674       	 ufdbGV.debug++;
2675 	 break;
2676       case 'F':
2677          format = optarg;
2678 	 if (!strmatch3( format, "2.0" )  &&
2679 	     !strmatch3( format, "2.1" )  &&
2680 	     !strmatch3( format, "2.2" )
2681 #if UFDB_DBFORMAT_3
2682           && !strmatch3( format, "3.0" )
2683 #endif
2684              )
2685 	 {
2686 #if UFDB_DBFORMAT_3
2687 	    fprintf( stderr, "-F option only accepts 2.0, 2.1, 2.2 and 3.0 as file format specifiers\n" );
2688 #else
2689 	    fprintf( stderr, "-F option only accepts 2.0, 2.1 and 2.2 as file format specifiers\n" );
2690 #endif
2691 	    usage();
2692 	 }
2693 	 if (strmatch3( format, "2.0" )  ||
2694 	     strmatch3( format, "2.1" ))
2695          {
2696             utf8support = 0;
2697          }
2698 	 break;
2699       case 't':
2700          tableName = optarg;
2701 	 break;
2702       case 'd':
2703          domainsFileName = optarg;
2704 	 break;
2705       case 's':
2706       	 doSanityCheck = 1;
2707 	 break;
2708       case 'S':
2709          printStatistics = 1;
2710          break;
2711       case 'u':
2712          urlsFileName = optarg;
2713 	 break;
2714       case 'U':
2715          urlsIncluded = 1;
2716          break;
2717       case 'k':
2718          strncpy( encryptKey, optarg, 16 );
2719 	 encryptKey[16] = '\0';
2720 	 if (strlen( encryptKey ) != 16)
2721 	 {
2722 	    fprintf( stderr, "key \"%s\" is not valid.\n", encryptKey );
2723 	    usage();
2724 	 }
2725 	 break;
2726       case 'n':
2727          doCrypt = 0;
2728 	 break;
2729       case 'N':
2730          utf8support = 0;
2731          break;
2732       case 'p':
2733          doPadding = 1;
2734 	 break;
2735       case 'P':
2736          doProd = 1;
2737 	 break;
2738       case 'C':
2739 #if UFDB_BZ2LIB_SUPPORT
2740          doBZ2compress = 1;
2741 #else
2742          fprintf( stderr, "bzip2 compression is not supported on this platform.\n" );
2743          exit( 4 );
2744 #endif
2745 	 break;
2746       case 'q':
2747          doWarnings = 0;
2748 	 break;
2749       case 'V':
2750          printf( "%s version " UFDB_VERSION "\n", ufdbGV.progname );
2751 	 printf( "Copyright (C) 2005-2020 by URLfilterDB B.V.\n" );
2752 	 exit( 0 );
2753       case 'W':
2754          stripWWW = 1;
2755 	 break;
2756       case 'X':
2757          skipOptimisations = 1;
2758 	 break;
2759       case 'Z':
2760          doZLIBcompress = 1;
2761 	 break;
2762       case '?':
2763 	 fprintf( stderr, "help:\n" );
2764          usage();
2765 	 break;
2766       default:
2767 	 fprintf( stderr, "internal error: getopt returned \"%c\"\n", opt );
2768          usage();
2769 	 break;
2770       }
2771    }
2772 
2773    if (strlen(tableName) > 15)
2774    {
2775       tableName[15] = '\0';
2776       fprintf( stderr, "warning: the tableName is truncated to \"%s\"\n", tableName );
2777    }
2778 
2779    if (domainsFileName == NULL)
2780    {
2781       fprintf( stderr, "the input file name is not specified: use the -d option\n" );
2782       usage();
2783    }
2784 
2785    if (doZLIBcompress && doBZ2compress)
2786    {
2787       fprintf( stderr, "use -C or -Z but not both.\n" );
2788       usage();
2789    }
2790 
2791    fin = fopen( domainsFileName, "r" );
2792    if (fin == NULL)
2793    {
2794       char strbuf[128];
2795       strerror_r( errno, strbuf, sizeof(strbuf) );
2796       fprintf( stderr, "cannot read from \"%s\": %s\n", domainsFileName, strbuf );
2797       usage();
2798    }
2799    if (ufdbGV.debug)
2800       fprintf( stderr, "processing domains from file \"%s\"\n", domainsFileName );
2801 
2802    strcpy( outFileName, domainsFileName );
2803    strcat( outFileName, UFDBfileSuffix );
2804 
2805    strcpy( tempOutFileName, outFileName );
2806    strcat( tempOutFileName, ".temp" );
2807 
2808    fout = fopen( tempOutFileName, "w+" );
2809    if (fout == NULL)
2810    {
2811       char strbuf[128];
2812       strerror_r( errno, strbuf, sizeof(strbuf) );
2813       fprintf( stderr, "cannot write to \"%s\": %s\n", tempOutFileName, strbuf );
2814       usage();
2815    }
2816    if (ufdbGV.debug)
2817       fprintf( stderr, "opened temporary file \"%s\"\n", tempOutFileName );
2818    fout_buffer = (char *) malloc( 64*1024 );
2819    setvbuf( fout, fout_buffer, _IOFBF, 64*1024 );
2820 
2821    /* setlinebuf( stderr ); */
2822    initTable( tableName );
2823 
2824    URLparamSupport = 0;
2825    if ((format[0] == '2' && format[2] >= '2')  ||  format[0] >= '3')
2826       URLparamSupport = 1;
2827 
2828 
2829    /* process the domains ********************************************/
2830    n = 0;
2831 readdomains:
2832    while (!feof(fin))
2833    {
2834       int             last_char;
2835       unsigned char * ptr;
2836 
2837       ptr = domain;
2838 
2839       while ((*ptr = last_char = myfast_getc(fin)) != '\n')
2840       {
2841 	 /* check for a last line without \n */
2842          if (last_char == EOF)
2843 	 {
2844 	    if (ptr != domain)
2845 	       break;
2846 	    goto eof;
2847 	 }
2848 	 if (last_char == '\r')
2849 	    continue;
2850 	 ptr++;
2851 	 if (ptr > &domain[4090])
2852 	 {
2853 	    *ptr = '\0';
2854 	    fprintf( stderr, "line too long: %s\n", domain );
2855 	    while (!feof(fin) && myfast_getc(fin) != '\n')
2856 	       ;
2857 	    goto readdomains;
2858 	 }
2859       }
2860       *ptr = '\0';
2861 
2862       d = domain;
2863       while (*d == ' ')
2864 	 d++;
2865 
2866       if (d[0] != '\0'  &&  d[0] != '#')
2867       {
2868 	 char * first_slash;
2869 
2870          d = skipProtocol( d );
2871          if (d == NULL)
2872             d = domain;
2873 
2874 	 /* TODO: use has_tld(domain+4) */
2875          if (d[0] == 'w' && d[1] == 'w' && d[2] == 'w' && d[3] == '.' && strchr( (char*) d+4, '.' ) != NULL)
2876 	 {
2877 	    if (stripWWW)
2878 	    {
2879 	       if (doWarnings)
2880 		  fprintf( stderr, "notice: \"www.\" is stripped for %s\n", d );
2881 	    }
2882 	    else if (doWarnings)
2883 	    {
2884 	       fprintf( stderr, "warning: domain name starts with \"www.\": %s (use -W option ?)\n", d );
2885 	    }
2886 	 }
2887 
2888 	 if (doWarnings)
2889 	 {
2890 	    if (ptr - d > 66)
2891 	       fprintf( stderr, "warning: long domain name: %s\n", d );
2892 	 }
2893 
2894          if (!urlsIncluded)
2895          {
2896             first_slash = strchr( (char *) d, '/' );
2897             if (first_slash != NULL)
2898                fprintf( stderr, "warning: domain name (%s) has a '/'\n", d );
2899          }
2900 
2901 	 if (UFDBsanityCheckDomainname( (char *) d ))
2902 	    addDomain( admin, (char *) d, UFDBdomain );
2903       }
2904    }
2905 eof:
2906    fclose( fin );
2907 
2908    /* process the urls ***********************************************/
2909    if (urlsFileName != NULL)
2910    {
2911       fin = fopen( urlsFileName, "r" );
2912       if (fin == NULL)
2913       {
2914          char strbuf[128];
2915          strerror_r( errno, strbuf, sizeof(strbuf) );
2916 	 fprintf( stderr, "cannot read from \"%s\": %s\n", urlsFileName, strbuf );
2917 	 fclose( fout );
2918 	 unlink( tempOutFileName );
2919 	 usage();
2920       }
2921       if (ufdbGV.debug)
2922 	 fprintf( stderr, "processing urls from file \"%s\"\n", urlsFileName );
2923 
2924 readurls:
2925       while (!feof(fin))
2926       {
2927          int             last_char;
2928 	 unsigned char * qm;
2929 	 unsigned char * ptr;
2930 	 unsigned char * first_slash;
2931 
2932 	 qm = NULL;
2933 	 ptr = domain;
2934 
2935 	 while ((*ptr = last_char = myfast_getc(fin)) != '\n')
2936 	 {
2937 	    /* check for a last line without \n */
2938 	    if (last_char == EOF)
2939 	    {
2940 	       if (ptr != domain)
2941 		  break;
2942 	       goto eof2;
2943 	    }
2944 	    if (last_char == '\r')			/* Skip '\r' */
2945 	       continue;
2946 	    if (qm == NULL  &&  last_char == '?')	/* remember the first question mark */
2947 	       qm = ptr;
2948 	    if (last_char < ' ')			/* illegal control character in URL */
2949 	    {
2950 	       if (doWarnings)
2951 		  fprintf( stderr, "illegal control character in URL: %s\n", domain );
2952 	       *ptr = '\0';
2953 	       while (!feof(fin) && myfast_getc(fin) != '\n')
2954 		  ;
2955 	       break;
2956 	    }
2957 	    ptr++;
2958 	    if (ptr > &domain[4090])
2959 	    {
2960 	       *ptr = '\0';
2961 	       fprintf( stderr, "URL too long: %s\n", domain );
2962 	       while (!feof(fin) && myfast_getc(fin) != '\n')
2963 		  ;
2964 	       goto readurls;
2965 	    }
2966 	 }
2967 	 *ptr = '\0';
2968 
2969 	 d = domain;
2970 	 while (*d == ' ')
2971 	    d++;
2972 	 d = skipProtocol( d );
2973 	 if (d == NULL)
2974 	    d = domain;
2975 
2976 	 if (d[0] != '\0'  &&  d[0] != '#')
2977 	 {
2978 	    first_slash = (unsigned char *) strchr( (char *) d, '/' );
2979 	    if (first_slash == NULL)
2980 	    {
2981 	       if (doWarnings)
2982 	       {
2983 		  fprintf( stderr, "warning: URL has no '/': %s\n", d );
2984 		  if (strlen( (char *) d ) > 66)
2985 		     fprintf( stderr, "warning: long domainname in URL: %s\n", d );
2986 	       }
2987 	    }
2988 	    else
2989 	    {
2990 	       int    pathlen;
2991 
2992 	       if (qm != NULL)
2993 	       {
2994 		  /* make sure that the database format is 2.2+ when parameters are used */
2995 		  if (!URLparamSupport)
2996 		  {
2997                      if (doWarnings)
2998                         fprintf( stderr, "warning: URL with parameters is only supported in format 2.2 and newer.  URL: %s\n", d );
2999 		     numWarnings++;
3000 		     *qm = '\0';
3001 		     qm = NULL;
3002 		  }
3003 		  else
3004 		  {
3005 		     if (*(qm+1) == '\0')
3006 		     {
3007                         if (doWarnings)
3008                            fprintf( stderr, "warning: URL with '?' has no parameters and '?' is removed from URL %s\n", d );
3009                         *qm = '\0';
3010                         qm = NULL;
3011 		     }
3012 		  }
3013 	       }
3014 
3015 	       pathlen = calcpathlen( (char *) first_slash );
3016 	       if (qm != NULL)
3017 	       {
3018 		  unsigned int params_len;
3019 
3020                   /* TO-DO: the following check on the length of all parameters is a bit limiting
3021                    * TO-DO: instead must check length of each parameter and each value.
3022                    */
3023 		  params_len = calcpathlen( (char *) qm );
3024 	          pathlen -= params_len;
3025 		  if (params_len >= sizeof(UFDBurlPart))
3026 		  {
3027 		     if (doWarnings)
3028 			fprintf( stderr, "warning: parameter list exceeds %d characters and may be too long: %s\n",
3029                                  (int) sizeof(UFDBurlPart)-1, d );
3030 		  }
3031                   sortURLparams( qm+1 );
3032 	       }
3033 	       if (doWarnings)
3034 	       {
3035 		  if (first_slash - d > 66)
3036 		     fprintf( stderr, "warning: long domainname in URL: %s\n", d );
3037 		  if (pathlen > 127)
3038 		     fprintf( stderr, "warning: long path in URL: %s\n", d );
3039                   else if (pathlen >= (int) sizeof(UFDBurlPart))
3040 		     fprintf( stderr, "warning: very long URL: %s\n", d );
3041 	       }
3042 	    }
3043 
3044 	    /* does URL start with "www." ? */
3045             if (d[0] == 'w'  &&  d[1] == 'w'  &&  d[2] == 'w'  &&  d[3] == '.' && strchr( (char*) d+4, '.' ) != NULL)
3046 	    {
3047 	       if (stripWWW)
3048 	       {
3049 		  if (doWarnings)
3050 		     fprintf( stderr, "notice: \"www.\" is stripped for %s\n", d );
3051 	       }
3052 	       else if (doWarnings)
3053 	       {
3054 		  fprintf( stderr, "warning: URL name starts with \"www.\": %s (use -W option ?)\n", d );
3055 	       }
3056 	    }
3057 
3058 	    if (UFDBsanityCheckDomainname( (char *) d ))
3059 	       addDomain( admin, (char *) d, UFDBurl );
3060 	 }
3061       }
3062 eof2:
3063       fclose( fin );
3064    }
3065 
3066    if (encryptKey[0] == '\0')
3067       generateRandomKey( encryptKey );
3068    copyKey( key, encryptKey );
3069 
3070    /* write the table in binary format to the memory buffer */
3071    init_membuf();
3072    if (strmatch3( format, "2.0" ))
3073    {
3074       DEBUG(( stderr, "writing table in DB 2.0 format\n" ));
3075       createMemTable_2_0( table );
3076    }
3077    else if (strmatch3( format, "2.1" ))
3078    {
3079       DEBUG(( stderr, "writing table in DB 2.1 format\n" ));
3080       createMemTable_2_1( table );
3081    }
3082    else if (strmatch3( format, "2.2" ))
3083    {
3084       DEBUG(( stderr, "writing table in DB 2.2 format\n" ));
3085       createMemTable_2_2( table );
3086    }
3087 #if UFDB_DBFORMAT_3
3088    else if (strmatch3( format, "3.0" ))
3089    {
3090       DEBUG(( stderr, "writing table in DB 3.0 format\n" ));
3091       createMemTable_3_0( table, 0, UFDBunknownType, 0 );
3092    }
3093 #endif
3094    mem_putc( UFDBendTable );
3095 
3096    /* when SSE and AVX/AVX2 instructions are used we need a 32-byte safeguard */
3097    for (n = 0; n < 32; n++)
3098       mem_putc( UFDBendTable );
3099 
3100 #if 0
3101    /* various performance tests have showed that using strcmp() or SSE4.2 is slower than byte-by-byte
3102     * string comparison so strcmp() is not used in the performance-critical strcmpURLpart().
3103     */
3104    mem_puts( (unsigned char *) "64 padding characters since strcmp might use 256-bit lookahead..." );
3105 #endif
3106 
3107    if (format[0] < '3')
3108       numIndexNodes = numNodes - numLeafNodes;
3109 
3110    /* write the table header to the output file */
3111    strcpy( flags, "--------" );
3112    if (doBZ2compress)
3113       flags[0] = 'C';
3114    if (doZLIBcompress)
3115       flags[0] = 'Z';
3116    if (doProd)
3117       flags[1] = 'P';
3118    if (doCrypt)
3119       flags[2] = 'Q';
3120    if (doPadding)
3121       flags[3] = 'p';
3122    flags[4] = endian;
3123    now = time( NULL );
3124    gmtime_r( &now, &tm );
3125    sprintf( date, "%4d%02d%02d.%02d%02d",
3126             tm.tm_year+1900, tm.tm_mon+1, tm.tm_mday, tm.tm_hour, tm.tm_min );
3127    sprintf( header.string, "%s %s %s %ld key=%s date=%s %8s %ld %d\n\n",
3128             "UFDB", format, tableName, mem_i, key, date, flags, numNodes, 0 );
3129    fprintf( fout, "%s", header.string );
3130    hdr_size = (doPadding || format[0] >= '3') ?
3131                  sizeof(struct UFDBfileHeader21) : sizeof (struct UFDBfileHeader);
3132    for (n = hdr_size - strlen(header.string); n > 0; n--)
3133       myfast_putc( '\0', fout );
3134 
3135    if (printStatistics || ufdbGV.debug)
3136    {
3137       fprintf( stderr, "table header: %s", header.string );
3138       fprintf( stderr, "#nodes:  %9ld   #leafs:  %9ld   #index:  %9ld\n",
3139 	       numNodes, numLeafNodes, numIndexNodes );
3140 #if UFDB_DBFORMAT_3
3141       if (format[0] == '3')
3142       {
3143          fprintf( stderr, "#labels: %9ld   #chunks: %9ld   #ovrflw: %9ld   #fewLab:   %9ld\n",
3144                   numLabelNodes, numChnksStat, numOverflow, numFewLabels );
3145          fprintf( stderr, "#path8:  %9ld   #path16: %9ld   #pth255: %9ld   #manyPath: %9ld\n",
3146                   numVeryFewPaths, numFewPaths, num250Paths, numManyPaths  );
3147       }
3148 #endif
3149       fprintf( stderr, "#sub1:   %9ld   #sub2:   %9ld   #sub3:   %9ld   #sub4:     %9ld\n",
3150 	       numSub1, numSub2, numSub3, numSub4 );
3151       fprintf( stderr, "#sub5:   %9ld   #sub6:   %9ld   #sub7:   %9ld   #sub8:     %9ld\n",
3152 	       numSub5, numSub6, numSub7, numSub8 );
3153       fprintf( stderr, "#sub8+:  %9ld   #sub255+:%9ld   #sub64K+:%9ld\n",
3154 	       numSub8plus, numSub255plus, numSub64Kplus );
3155    }
3156 
3157    /* encrypt and compress the table: rewind, read, compress, crypt and write */
3158    if (doCrypt || doBZ2compress || doZLIBcompress)
3159    {
3160       doCryptCompress( fout, encryptKey, format );
3161    }
3162    else
3163    {
3164       fwrite( mem, mem_i, 1, fout );
3165    }
3166 
3167    fflush( fout );
3168    fdatasync( fileno(fout) );
3169    fclose( fout );
3170    free( fout_buffer );
3171 
3172    /* to get around some permission problems: unlink before rename */
3173    if (unlink( outFileName ) < 0  &&  errno != ENOENT)
3174    {
3175       char strbuf[128];
3176       strerror_r( errno, strbuf, sizeof(strbuf) );
3177       fprintf( stderr, "cannot remove \"%s\": %s\n", outFileName, strbuf );
3178    }
3179    if (rename( tempOutFileName, outFileName ) != 0)
3180    {
3181       char strbuf[128];
3182       strerror_r( errno, strbuf, sizeof(strbuf) );
3183       fprintf( stderr, "cannot rename '%s' into '%s': %s\n", tempOutFileName, outFileName, strbuf );
3184       (void) unlink( tempOutFileName );
3185       exit( 1 );
3186    }
3187    else if (ufdbGV.debug)
3188       fprintf( stderr, "temporary file \"%s\" renamed to \"%s\"\n", tempOutFileName, outFileName );
3189 
3190    if (numWarnings)
3191       fprintf( stderr, "%d warning(s)  ***\n", numWarnings );
3192 
3193    return 0;
3194 }
3195 
3196 
/* since ufdbguard (single-threaded) and ufdbguardd (multi-threaded)
 * share source code, we put some pthread dummies here since we don't need/want pthreads.
 */
3200 
3201 #if GCC
3202 #pragma GCC diagnostic ignored "-Wunused-parameter"
3203 #endif
3204 
/* No-op replacement for the mutex lock: the argument is ignored and 0 is always returned. */
int ufdb_mutex_lock( ufdb_mutex * m __attribute__((unused)) )
{
   return 0;
}
3207 
/* No-op replacement for the mutex trylock: the argument is ignored and 0 is always returned. */
int ufdb_mutex_trylock( ufdb_mutex * m __attribute__((unused)) )
{
   return 0;
}
3210 
/* No-op replacement for the mutex unlock: the argument is ignored and 0 is always returned. */
int ufdb_mutex_unlock( ufdb_mutex * m __attribute__((unused)) )
{
   return 0;
}
3213 
/* No-op replacement for the mutex initialisation: the argument is ignored and nothing is done. */
void ufdb_mutex_init( ufdb_mutex * m __attribute__((unused)) )
{
}
3216 
3217 #if 0
3218 int pthread_cond_signal( pthread_cond_t * cond __attribute__((unused)) )
3219 {
3220    return 0;
3221 }
3222 
3223 int pthread_cond_wait(
3224     pthread_cond_t * cond  __attribute__((unused)),
3225     pthread_mutex_t * mutex  __attribute__((unused)) )
3226 {
3227    return 0;
3228 }
3229 #endif
3230 
3231 
3232 #ifdef __cplusplus
3233 }
3234 #endif
3235