1 /*
2  * ufdblib.c - URLfilterDB
3  *
4  * ufdbGuard is copyrighted (C) 2005-2020 by URLfilterDB with all rights reserved.
5  *
6  * Parts of ufdbGuard are based on squidGuard.
7  * This module is NOT based on squidGuard.
8  *
9  * RCS $Id: ufdblib.c,v 1.171 2020/10/30 15:38:09 root Exp root $
10  */
11 
/* This module has been well tested and has been stable for a long time.
 * For maximum performance _FORTIFY_SOURCE is left undefined.
 */
15 #undef _FORTIFY_SOURCE
16 
17 #if (__GNUC__ > 4)  ||  (__GNUC__ == 4  &&  __GNUC_MINOR__ >= 4)
18 #pragma GCC push_options
19 #pragma GCC optimize ("O3")
20 #endif
21 
22 /* to inline string functions with gcc : */
23 #if defined(__OPTIMIZE__) && defined(__GNUC__)  &&  defined(GCC_INLINE_STRING_FUNCTIONS_ARE_FASTER)
24 #define __USE_STRING_INLINES  1
25 #endif
26 
27 #include "ufdb.h"
28 #include "ufdblib.h"
29 #include "ufdblocks.h"
30 
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <strings.h>
34 #include <string.h>
35 #include <unistd.h>
36 #include <ctype.h>
37 #include <time.h>
38 #include <errno.h>
39 #include <sys/types.h>
40 #include <sys/time.h>
41 
42 #if !UFDB_BARE_METAL_SUPPORT
43 #include <syslog.h>
44 /* TODO: evaluate use of syslog() */
45 #include <sys/stat.h>
46 #include <fcntl.h>
47 #include <poll.h>
48 #include <sys/socket.h>
49 #if HAVE_UNIX_SOCKETS
50 #include <sys/un.h>
51 #endif
52 #include <netdb.h>
53 #include <netinet/in.h>
54 #include <netinet/tcp.h>
55 #include <netinet/ip6.h>
56 // for inet_pton() and inet_ntop()
57 #include <arpa/inet.h>
58 #endif
59 
60 #if UFDB_BARE_METAL_SUPPORT && __OCTEON__
61 #include "cvmx-rtc.h"
62 #endif
63 
64 #ifdef __cplusplus
65 extern "C" {
66 #endif
67 
/*
 * strmatchN(a,b): true when the first N characters of a are equal to the
 * first N characters of b and a ends right after them (a[N] == '\0').
 * The terminator of b is never inspected, so b is expected to be a string of
 * exactly N characters (typically a literal).
 * Both arguments are evaluated more than once: do not pass expressions with
 * side effects.
 * The disabled "#if 0" branch is a plain strcmp() based alternative.
 */
#if 0
#define strmatch2(a,b) (strcmp(a,b) == 0)
#define strmatch3(a,b) (strcmp(a,b) == 0)
#define strmatch4(a,b) (strcmp(a,b) == 0)
#define strmatch5(a,b) (strcmp(a,b) == 0)
#define strmatch6(a,b) (strcmp(a,b) == 0)
#define strmatch7(a,b) (strcmp(a,b) == 0)
#define strmatch8(a,b) (strcmp(a,b) == 0)
#else
#define strmatch2(a,b)    ((a)[0] == (b)[0]  &&  (a)[1] == (b)[1]  &&  (a)[2] == '\0')
#define strmatch3(a,b)    ((a)[0] == (b)[0]  &&  (a)[1] == (b)[1]  &&  (a)[2] == (b)[2]  &&  (a)[3] == '\0')
#define strmatch4(a,b)    ((a)[0] == (b)[0]  &&  (a)[1] == (b)[1]  &&  (a)[2] == (b)[2]  &&  (a)[3] == (b)[3]  &&  (a)[4] == '\0')
#define strmatch5(a,b)    ((a)[0] == (b)[0]  &&  (a)[1] == (b)[1]  &&  (a)[2] == (b)[2]  &&  (a)[3] == (b)[3]  &&  (a)[4] == (b)[4]  && (a)[5] == '\0')
#define strmatch6(a,b)    ((a)[0] == (b)[0]  &&  (a)[1] == (b)[1]  &&  (a)[2] == (b)[2]  &&  (a)[3] == (b)[3]  &&  (a)[4] == (b)[4]  &&  (a)[5] == (b)[5]  &&  (a)[6] == '\0')
#define strmatch7(a,b)    ((a)[0] == (b)[0]  &&  (a)[1] == (b)[1]  &&  (a)[2] == (b)[2]  &&  (a)[3] == (b)[3]  &&  (a)[4] == (b)[4]  &&  (a)[5] == (b)[5]  &&  (a)[6] == (b)[6]  &&  (a)[7] == '\0')
#define strmatch8(a,b)    ((a)[0] == (b)[0]  &&  (a)[1] == (b)[1]  &&  (a)[2] == (b)[2]  &&  (a)[3] == (b)[3]  &&  (a)[4] == (b)[4]  &&  (a)[5] == (b)[5]  &&  (a)[6] == (b)[6]  &&  (a)[7] == (b)[7]  &&  (a)[8] == '\0')
#endif
85 
86 /* #define  UFDB_DEBUG_IPV6 */
87 
/* #define UFDB_DO_DEBUG 1 */
#define UFDB_DO_DEBUG 0

/*
 * DEBUG(x) expands to "fprintf x" when debugging is switched on, so x must be
 * a complete, parenthesised fprintf() argument list, e.g.
 *    DEBUG(( stderr, "n=%d\n", n ));
 * With debugging switched off the macro expands to nothing.
 */
#if UFDB_DO_DEBUG || 0
#define DEBUG(x) fprintf x
#else
#define DEBUG(x)
#endif
96 
struct unknownURLentry;

/*
 * Sizes of the string buffers inside struct unknownURLentry.
 * The referer buffer is 128 bytes minus the size of the 'next' pointer and
 * the 'nhits' counter that precede the strings in the struct.
 */
#define UFDB_UUE_DOMAIN_LENGTH   (128)
#define UFDB_UUE_PATH_LENGTH     (128)
#define UFDB_UUE_REFERER_LENGTH	 (128 - sizeof(struct unknownURLentry *) - sizeof(long))

/*
 * One registered uncategorised URL.
 * Entries that hash to the same uueHash[] slot are chained through 'next'.
 */
struct unknownURLentry
{
   struct unknownURLentry * next;                       /* next entry in the same hash chain, or NULL */
   long    nhits;                                       /* number of times this domain was registered */
   char    domain[UFDB_UUE_DOMAIN_LENGTH];              /* domain name, optionally followed by ":port" */
   char    firstpath[UFDB_UUE_PATH_LENGTH];             /* part of the URL that follows the domain */
   char    referer[UFDB_UUE_REFERER_LENGTH];            /* referer passed with the URL, or "" */
};
111 
112 
113 
/*
 * Return a pointer to the first character of url that terminates the domain
 * part ('/', '?', '&', ';' or '#'), or to the terminating NUL when url has no
 * such character.
 */
UFDB_GCC_HOT
inline static char * findDomainEnd( char * url )
{
   char * delimiter;

   delimiter = strpbrk( url, "/?&;#" );
   if (delimiter == NULL)
      delimiter = url + strlen( url );

   return delimiter;
}
119 
120 
/*
 * Search for chr in the range [str, maxstr).
 * Returns a pointer to the first occurrence or NULL when chr does not occur
 * before maxstr.  A NUL byte inside the range does not stop the search.
 */
UFDB_GCC_HOT
inline static char * strchr_before( char * str, char * maxstr, char chr )
{
   size_t span;
   void * hit;

   span = maxstr - str;
   hit = memchr( str, chr, span );

   return (char *) hit;
}
126 
127 
/*
 * Find the end of the protocol prefix of URL.
 * The first character out of ".:/?@#%" is located; when it is the ':' of a
 * "://" sequence a pointer to that ':' is returned, otherwise NULL.
 * The offset is kept as a pointer, so the size_t result of strcspn() is
 * never narrowed to an int.
 */
UFDB_GCC_HOT
inline static char * findProtocolEnd( char * URL )
{
   char * stop;

   stop = URL + strcspn( URL, ".:/?@#%" );
   /* the && chain stops at the first mismatch, so nothing is read past the NUL */
   if (stop[0] == ':'  &&  stop[1] == '/'  &&  stop[2] == '/')
      return stop;

   return NULL;
}
138 
139 
/* Application initialisation hook; currently there is nothing to initialise. */
void UFDBappInit( void )
{
   return;
}
144 
145 
146 #if UFDB_BARE_METAL_SUPPORT && __OCTEON__
/* OCTEON: CPU time measurement is not implemented; t is left untouched. */
void UFDBtimerInit( struct tms * t )
{
   (void) t;
}
150 
/* OCTEON: CPU time measurement is not implemented; t is left untouched. */
void UFDBtimerStop( struct tms * t )
{
   (void) t;
}
154 
/*
 * OCTEON: write a fixed "not implemented" text into line.
 * t and tag are ignored; line must be large enough for the message.
 */
void UFDBtimerPrintString( char * line, struct tms * t, const char * tag )
{
   (void) t;
   (void) tag;
   strcpy( line, "CPU time measurement not implemented on OCTEON" );
}
159 
UFDBtimerPrint(struct tms * t,const char * tag)160 void UFDBtimerPrint( struct tms * t, const char * tag )
161 {
162    if (ufdbGV.debug)
163       ufdbLogMessage( "UFDBtimerPrint not supported on OCTEON" );
164 }
165 
166 #else
167 
/* Start a CPU time measurement: store the current process times in t. */
void UFDBtimerInit( struct tms * t )
{
   clock_t ignored;

   ignored = times( t );
   (void) ignored;              /* the elapsed real time is not used */
}
172 
173 
/*
 * Stop a CPU time measurement.
 * On entry t holds the times stored by UFDBtimerInit(); on return each field
 * of t holds the difference between the current process times and that
 * starting value.
 */
void UFDBtimerStop( struct tms * t )
{
   const struct tms started = *t;
   struct tms       now;

   (void) times( &now );

   t->tms_utime  = now.tms_utime  - started.tms_utime;
   t->tms_stime  = now.tms_stime  - started.tms_stime;
   t->tms_cutime = now.tms_cutime - started.tms_cutime;
   t->tms_cstime = now.tms_cstime - started.tms_cstime;
}
185 
186 
/* number of clock ticks per second; looked up on first use */
UFDB_SHARED static double  numTicks = 0.0;

/*
 * Format the user, system and total CPU time of t (in seconds) into line.
 * tag is used as prefix; a NULL tag is replaced by "UFDB timer".
 * line has no size parameter: the caller must supply a buffer that is large
 * enough for the tag plus the formatted numbers.
 */
void UFDBtimerPrintString( char * line, struct tms * t, const char * tag )
{
   const char * label;
   double       userSeconds;
   double       sysSeconds;
   double       totalSeconds;

   if (numTicks == 0.0)
      numTicks = (double) sysconf( _SC_CLK_TCK );

   label = (tag != NULL) ? tag : "UFDB timer";

   userSeconds  = (double) t->tms_utime / numTicks;
   sysSeconds   = (double) t->tms_stime / numTicks;
   totalSeconds = (double) (t->tms_utime+t->tms_stime) / numTicks;

   sprintf( line, "%s:  %5.2f user  %5.2f sys  %5.2f total",
            label, userSeconds, sysSeconds, totalSeconds );
}
203 
204 
/* Format the CPU times of t with UFDBtimerPrintString() and write them to the log. */
void UFDBtimerPrint( struct tms * t, const char * tag )
{
   char    text[256];

   UFDBtimerPrintString( text, t, tag );
   ufdbLogMessage( "%s", text );
}
212 #endif
213 
214 
/*
 * Portable gettimeofday().
 * On OCTEON bare metal the CVMX_MIO_PTP_CLOCK_HI register is read and treated
 * as a nanosecond counter: tv_sec receives the whole seconds and tv_usec the
 * remainder converted to microseconds; tz is ignored and 0 is returned.
 * On all other platforms this is a plain call to gettimeofday().
 */
int UFDBgettimeofday( struct timeval * tv, struct timezone * tz )
{
#if UFDB_BARE_METAL_SUPPORT && __OCTEON__
   uint64_t ns = cvmx_read_csr( CVMX_MIO_PTP_CLOCK_HI );

   (void) tz;
   if (ufdbGV.debug > 1)
      ufdbLogMessage( "UFDBgettimeofday: CVMX_MIO_PTP_CLOCK_HI is %llu", (unsigned long long) ns );
   tv->tv_sec = ns / (1000UL * 1000 * 1000);
   /* the remainder is in nanoseconds while tv_usec counts microseconds */
   tv->tv_usec = (ns % (1000UL * 1000 * 1000)) / 1000;
   return 0;
#else
   return gettimeofday( tv, tz );
#endif
}
228 
229 
/*
 * Portable time().
 * On OCTEON bare metal the CVMX_MIO_PTP_CLOCK_HI register is read, treated
 * as a nanosecond counter and converted to seconds.
 * On all other platforms this is time( NULL ).
 */
time_t UFDBtime( void )
{
#if UFDB_BARE_METAL_SUPPORT && __OCTEON__
   uint64_t ns = cvmx_read_csr( CVMX_MIO_PTP_CLOCK_HI );

   if (ufdbGV.debug > 1)
      ufdbLogMessage( "UFDBtime: CVMX_MIO_PTP_CLOCK_HI is %llu", (unsigned long long) ns );
   return (time_t) (ns / (1000UL * 1000 * 1000));
   // return (time_t) cvmx_rtc_read();
#else
   return time( NULL );
#endif
}
242 
243 
/*
 * Tell whether a real time clock is available.
 * On OCTEON bare metal this is the result of cvmx_rtc_supported();
 * all other platforms always report 1.
 */
int UFDBhasRTC( void )
{
   int available = 1;

#if UFDB_BARE_METAL_SUPPORT && __OCTEON__
   available = (cvmx_rtc_supported() != 0);
#endif

   return available;
}
252 
253 
/*
 * Rewrite obfuscated numbers in a domain and in the URL that starts with it.
 *
 * Every dot-separated part of domain that starts with '0' is interpreted as
 * a hexadecimal ("0x..." or "0X...") or octal ("0...") number and rewritten
 * in decimal notation, e.g. "0x7f.0.0.01" becomes "127.0.0.1".
 * The normalised name is written back into domain and replaces the first
 * strlen(original domain) characters of URL; the rest of URL is shifted.
 *
 * domain  in/out: NUL-terminated domain name.
 * URL     in/out: NUL-terminated URL that starts with the same domain.
 *
 * When the normalised name does not fit in the internal 256-byte buffer,
 * domain and URL are left untouched.
 */
static void removeObfuscations( char * domain, char * URL )
{
   const char * s;
   char *       d;
   char *       db;
   char *       newDomainEnd;
   unsigned int n;
   size_t       ndigits;
   size_t       oldDomainLength;
   size_t       newDomainLength;
   int          octetStart;
   char         newDomain[256];
   char         decimalBuf[16];

   /*
    *  First put a normalised domain in newDomain,
    *  then copy newDomain back to domain and rewrite the start of URL.
    */
   s = domain;
   d = newDomain;
   newDomainEnd = &newDomain[ sizeof(newDomain) - 1 ];         /* keeps room for the NUL */

   /* parse hex and octal numbers in the octets and rewrite them in decimal notation */
   octetStart = 1;
   while (*s != '\0')
   {
      if (*s == '0'  &&  octetStart)
      {
         s++;
         n = 0;         /* unsigned: an absurdly long number wraps instead of overflowing */

         if (*s == 'x'  ||  *s == 'X')
         {
            s++;
            while (isxdigit( (unsigned char) *s ))
            {
               int c = tolower( (unsigned char) *s );
               n = n * 16 + (unsigned int) ((c <= '9') ? (c - '0') : (c - 'a' + 10));
               s++;
            }
         }
         else
         {
            /* octal digits only; the loop body is skipped for a single '0' */
            while (*s >= '0'  &&  *s <= '7')
            {
               n = n * 8 + (unsigned int) (*s - '0');
               s++;
            }
         }

         db = &decimalBuf[ sizeof(decimalBuf) - 1 ];
         *db = '\0';
         do                     /* write the number in decimal notation (from right to left) */
         {
            db--;
            *db = (char) ('0' + n % 10);
            n = n / 10;
         } while (n > 0);

         ndigits = (size_t) (&decimalBuf[ sizeof(decimalBuf) - 1 ] - db);
         if (ndigits > (size_t) (newDomainEnd - d))
            return;             /* does not fit: leave domain and URL as they are */
         memcpy( d, db, ndigits );
         d += ndigits;
      }
      else
      {
         if (d >= newDomainEnd)
            return;             /* does not fit: leave domain and URL as they are */
         octetStart = (*s == '.');
         *d++ = *s++;
      }
   }
   *d = '\0';

   oldDomainLength = (size_t) (s - domain);
   newDomainLength = (size_t) (d - newDomain);

   /* update the domain */
   memcpy( domain, newDomain, newDomainLength + 1 );

   /* update the URL */
   if (newDomainLength < oldDomainLength)
   {
      /* copy in the shorter domain and close the gap; the NUL is moved along with the path */
      memcpy( URL, newDomain, newDomainLength );
      memmove( URL + newDomainLength, URL + oldDomainLength, strlen( URL + oldDomainLength ) + 1 );
   }
   else if (newDomainLength == oldDomainLength)
   {
      /* the path does not need any updating */
      memcpy( URL, newDomain, newDomainLength );
   }
   else
   {
      /* move the path including its NUL and then copy in the larger domain (extremely unlikely) */
      memmove( URL + newDomainLength, URL + oldDomainLength, strlen( URL + oldDomainLength ) + 1 );
      memcpy( URL, newDomain, newDomainLength );
   }
}
373 
374 
/* Held by ufdbRegisterUnknownURL(), ufdbGetUnknownURLs() and ufdbResetUnknownURLs()
 * while they use the table, the hash and the list below.
 */
UFDB_GCC_ALIGN_CL
UFDB_SHARED static ufdb_mutex mutex_url_history = ufdb_mutex_initializer;

/* uues:     table of entries; n_uue is the number of entries in use.
 * uue_list: text buffer filled by ufdbGetUnknownURLs() and freed by ufdbResetUnknownURLs().
 * n_uue:    -1 until the table is allocated; ufdbGetUnknownURLs() sets it to
 *           UFDB_UUE_MAXURLS to block additions until the next reset.
 */
UFDB_GCC_ALIGN_CL
UFDB_SHARED static struct unknownURLentry * uues = NULL;     // malloc-ed array with UFDB_UUE_MAXURLS structs
UFDB_SHARED static char *                   uue_list = NULL;
UFDB_SHARED static volatile int 	    n_uue = -1;

/* hash table with chains of entries that live in uues[] */
UFDB_GCC_ALIGN_CL
UFDB_SHARED static struct unknownURLentry * uueHash[UFDB_UUE_HASHSIZE];         // TODO: also malloc ?

/* Size in bytes of uue_list: per table entry the three string buffers plus
 * 1 + 7 + 1 + 1 bytes -- presumably the two '#' separators, up to 7 digits
 * for nhits and the closing '|' of the "%s%s#%ld#%s|" format (TODO confirm).
 */
#define UUE_LIST_SIZE           (UFDB_UUE_MAXURLS *  \
                                    (UFDB_UUE_DOMAIN_LENGTH + UFDB_UUE_PATH_LENGTH + 1 + 7 + 1 +  \
                                     UFDB_UUE_REFERER_LENGTH + 1))
389 
390 
391 /* Store the domainname of a URL in the uue hashtable.
392  * If it already exists, increment the nhits counter.
393  */
ufdbRegisterUnknownURL(char * webserver,int portnumber,char * referer)394 void ufdbRegisterUnknownURL(
395    char * webserver,
396    int    portnumber,
397    char * referer )
398 {
399    int                       length;
400    unsigned int              hv;
401    char *                    path;
402    struct unknownURLentry  * he;
403    struct unknownURLentry ** prev;
404    char                      domain[UFDB_UUE_DOMAIN_LENGTH];
405 
406    if (ufdbGV.reconfig != UFDB_RECONFIGR_NONE  ||  n_uue >= UFDB_UUE_MAXURLS)
407       return;
408 
409    /* webserver names that do not have a dot (.) have no domain name          */
410    /* and can never be incorporated in the URL database and are skipped here. */
411    if (webserver == NULL)
412       return;
413 
414    path = findDomainEnd( webserver );
415    length = path - webserver;
416    if (length == 0  ||  strchr( webserver, '.' ) == NULL)
417       return;
418 
419    if (ufdbGV.debug > 1)
420       ufdbLogMessage( "ufdbRegisterUnknownURL: %s %d %s", webserver, portnumber, path );
421 
422    if (length < UFDB_UUE_DOMAIN_LENGTH - 1)
423    {
424       strncpy( domain, webserver, length );
425       domain[length] = '\0';
426       if (portnumber != 80  &&  length < UFDB_UUE_DOMAIN_LENGTH - 6)
427          sprintf( &domain[length], ":%d", portnumber );
428    }
429    else
430    {
431       char * orig;
432 
433       /* the domainname is huge :-( */
434       /* reduce the length of the domainname by stripping as many subdomains as required */
435       orig = webserver;
436       webserver += (length - (UFDB_UUE_DOMAIN_LENGTH - 2 - 2));
437       while (*webserver != '\0'  &&  *webserver != '.')
438          webserver++;
439       if (*webserver == '\0')
440       {
441          ufdbLogError( "ufdbRegisterUnknownURL: cannot store very long domainname \"%s\"", orig );
442          return;
443       }
444       webserver++;
445       path = findDomainEnd( webserver );
446       length = path - webserver;
447       domain[0] = '*';
448       domain[1] = '.';
449       strncpy( domain+2, webserver, length );
450       domain[length+2] = '\0';
451       if (length == 0  ||  strchr( domain+2, '.' ) == NULL)
452          return;
453       if (portnumber != 80  &&  length < UFDB_UUE_DOMAIN_LENGTH - 6 - 2 - 1)
454          sprintf( &domain[length], ":%d", portnumber );
455    }
456 
457    /* calculate hash */
458    hv = 100003;
459    {
460       char * t;
461       for (t = domain;  *t != '\0';  t++)
462          hv = (hv ^ (hv << 11)) ^ (*t * 17);
463    }
464    hv = hv % UFDB_UUE_HASHSIZE;
465 
466    ufdb_mutex_lock( &mutex_url_history );		/* ======================================= */
467 
468    if (uues == NULL)
469    {
470       uues = (struct unknownURLentry*) ufdbCalloc( sizeof(struct unknownURLentry), UFDB_UUE_MAXURLS );
471       n_uue = 0;
472    }
473 
474    /* check again for full table */
475    if (n_uue >= UFDB_UUE_MAXURLS)
476    {
477       ufdb_mutex_unlock( &mutex_url_history );				        /* ============= */
478       return;
479    }
480 
481    /* find domain in uue table */
482    prev = &uueHash[hv];
483    he = uueHash[hv];
484    while (he != NULL)
485    {
486       if (strcmp( domain, he->domain ) == 0)
487       {
488 	 /* found in hashtable */
489          /* The URL path is saved in the hashtable, but the first one is never interesting,
490           * so overwrite it with number 2 and 3.
491           */
492          if (he->nhits < 3)
493          {
494             ufdbStrncpy( he->firstpath, path, sizeof(he->firstpath) );
495             if (referer != NULL)
496                ufdbStrncpy( he->referer, referer, sizeof(he->referer) );
497          }
498          he->nhits++;
499 	 ufdb_mutex_unlock( &mutex_url_history );				/* ============= */
500 	 return;
501       }
502       prev = &(he->next);
503       he = he->next;
504    }
505 
506    /* domain was not found in hashtable; prev points to pointer to assign new entry */
507    he = &uues[n_uue];
508    n_uue++;
509    *prev = he;
510    he->next = NULL;
511    he->nhits = 1;
512    strcpy( he->domain, domain );
513    strcpy( he->firstpath, path );
514    if (referer == NULL)
515       he->referer[0] = '\0';
516    else
517       ufdbStrncpy( he->referer, referer, sizeof(he->referer) );
518 
519    ufdb_mutex_unlock( &mutex_url_history );		        /* ======================================= */
520 
521    return;
522 }
523 
524 
525 /*
526  * Retrieve all registered uncategorised URLs.
527  * This must be followed by a ufdbResetUnknownURLs().
528  */
ufdbGetUnknownURLs(void)529 char * ufdbGetUnknownURLs( void )
530 {
531    int    i;
532    char * tail;
533    struct unknownURLentry * he;
534 
535    ufdb_mutex_lock( &mutex_url_history );		// >=============================================
536 
537    n_uue = UFDB_UUE_MAXURLS;				// prevent additions until call to ufdbResetUnknownURLs()
538 
539    if (uue_list == NULL)
540       uue_list = (char*) ufdbMalloc( UUE_LIST_SIZE );
541    tail = uue_list;
542 
543    *tail++ = '|';
544    *tail++ = 'N';	// "N1" signals the receiver that the list has "<url> # <nhits>"
545    *tail++ = '1';
546    *tail++ = '|';
547    for (i = 0;  i < UFDB_UUE_HASHSIZE;  i++)
548    {
549       for (he = uueHash[i];  he != NULL;  he = he->next)
550       {
551 	 tail += sprintf( tail, "%s%s#%ld#%s|", he->domain, he->firstpath, he->nhits, he->referer );
552 	 if (tail > uue_list + UUE_LIST_SIZE - sizeof(struct unknownURLentry))
553 	    goto too_many;
554       }
555    }
556 too_many:
557    *tail = '\0';
558 
559    ufdb_mutex_unlock( &mutex_url_history );		// <=============================================
560 
561    return uue_list;
562 }
563 
564 
ufdbResetUnknownURLs(void)565 void ufdbResetUnknownURLs( void )
566 {
567    int i;
568 
569    ufdb_mutex_lock( &mutex_url_history );		// >=======================================
570 
571    if (uues == NULL)
572    {
573       uues = (struct unknownURLentry*) ufdbCalloc( sizeof(struct unknownURLentry), UFDB_UUE_MAXURLS );
574    }
575    n_uue = 0;
576 
577    if (uue_list != NULL)
578    {
579       ufdbFree( uue_list );
580       uue_list = NULL;
581    }
582 
583    for (i = 0;  i < UFDB_UUE_HASHSIZE;  i++)
584       uueHash[i] = NULL;
585 
586    ufdb_mutex_unlock( &mutex_url_history );		// <=======================================
587 }
588 
589 
590 #if !UFDB_BARE_METAL_SUPPORT
/*
 * Open a TCP connection to serverName on the given port.
 *
 * serverName is resolved with getaddrinfo() (IPv4 only unless
 * ufdbGV.useAlsoIPv6onWan is set) and the returned addresses are tried one
 * by one until a connection succeeds.  Each attempt is made in non-blocking
 * mode and waits at most 8 seconds.  The loop ends after 6 counted tries,
 * where an attempt that times out counts twice.
 *
 * Returns the connected socket, in blocking mode and with send and receive
 * timeouts of 20 seconds, or -1 when no connection could be made.
 * errno is set to EAGAIN when the name cannot be resolved and to EINVAL when
 * port cannot be a port number.
 */
int UFDBopenSocket( const char * serverName, int port )
{
   int                 s;
   int                 ret;
   int                 sock_parm;
   int                 numTries;
   long                oldflags;
   time_t              t0, t;
   char                servicePort[8];
   struct addrinfo *   addrlist;
   struct addrinfo *   al;
   struct addrinfo     addrinfo_hints;
   struct timeval      tv;

   addrlist = NULL;
   /* a value that does not fit in servicePort cannot be a valid port number */
   ret = snprintf( servicePort, sizeof(servicePort), "%d", port );
   if (ret < 0  ||  ret >= (int) sizeof(servicePort))
   {
      ufdbLogError( "UFDBopenSocket: invalid port number %d for '%s'", port, serverName );
      errno = EINVAL;
      return -1;
   }

   addrinfo_hints.ai_flags = AI_NUMERICSERV;
#ifdef AI_IDN
   addrinfo_hints.ai_flags |= AI_IDN;
#endif
#ifdef AI_ADDRCONFIG
   addrinfo_hints.ai_flags |= AI_ADDRCONFIG;
#endif
   /* use AF_UNSPEC for IPv4 or IPv6; AF_INET for IPv4 only */
   addrinfo_hints.ai_family = ufdbGV.useAlsoIPv6onWan ? AF_UNSPEC : AF_INET;
   addrinfo_hints.ai_socktype = SOCK_STREAM;
   addrinfo_hints.ai_protocol = IPPROTO_TCP;
   addrinfo_hints.ai_addrlen = 0;
   addrinfo_hints.ai_addr = NULL;
   addrinfo_hints.ai_canonname = NULL;
   addrinfo_hints.ai_next = NULL;

   t0 = time( NULL );
   ret = getaddrinfo( serverName, servicePort, &addrinfo_hints, &addrlist );
   if (ret != 0)
   {
      t = time( NULL );
      if (t - t0 >= 4)
         ufdbLogMessage( "UFDBopenSocket: cannot resolve hostname %s: %s - %s. "
                         "getaddrinfo took %ld seconds to return.  Is the DNS server OK?  *****",
                         serverName, gai_strerror(ret),
                         ret == EAI_SYSTEM ? strerror(errno) : "errno is not set",
                         (long) (t - t0) );
      else
      {
         ufdbLogMessage( "UFDBopenSocket: cannot resolve hostname '%s': %s - %s",
                         serverName, gai_strerror(ret),
                         ret == EAI_SYSTEM ? strerror(errno) : "errno is not set" );
      }
#ifdef EAGAIN
      errno = EAGAIN;
#endif
      return -1;
   }

   numTries = 0;
   s = -1;
   al = addrlist;
   while (al != NULL  &&  numTries < 6)         // try to connect to many returned addresses until one succeeds
   {
      char addrbuf[INET6_ADDRSTRLEN+1];

      numTries++;
      if (al->ai_family == AF_INET)
      {
         if (inet_ntop( al->ai_family, &((struct sockaddr_in *) al->ai_addr)->sin_addr, addrbuf,
                        sizeof(addrbuf) ) == NULL)
            strcpy( addrbuf, "unknown" );
      }
      else
      {
         if (inet_ntop( al->ai_family, &((struct sockaddr_in6 *) al->ai_addr)->sin6_addr, addrbuf,
                        sizeof(addrbuf) ) == NULL)
            strcpy( addrbuf, "unknown" );
      }
      if (ufdbGV.debug > 1  ||  ufdbGV.peek)
      {
         ufdbLogMessage( "UFDBopenSocket: '%s' resolved to '%s', port %d  try #%d",
                         serverName, addrbuf, port, numTries );
      }

      s = socket( al->ai_family, SOCK_STREAM, 0 );
      if (s < 0)
      {
         if (ufdbGV.debug > 1  ||  ufdbGV.peek)
         {
            ufdbLogMessage( "UFDBopenSocket: could not open socket to '%s' (%s) port %d: %s - %s",
                            serverName, addrbuf, port, strerror(errno),
                            al->ai_next == NULL ? "no other IP addresses to try" : "trying next IP address" );
         }
         al = al->ai_next;
         continue;	/* we may not be able to make an IPv6 socket, so continue trying other sockets */
      }

      /*
       *  Prevent that the connect takes ages.  Use an aggressive timeout of 8 seconds.
       *  The socket options are set on a best-effort basis: failures are ignored.
       */
      tv.tv_sec = 8;
      tv.tv_usec = 0;
      setsockopt( s, SOL_SOCKET, SO_RCVTIMEO, (void *) &tv, sizeof(tv) );
      tv.tv_sec = 8;
      tv.tv_usec = 0;
      setsockopt( s, SOL_SOCKET, SO_SNDTIMEO, (void *) &tv, sizeof(tv) );

      sock_parm = 200 * 1024;
      setsockopt( s, SOL_SOCKET, SO_SNDBUF, (void *) &sock_parm, sizeof(sock_parm) );

      /* The RCVTIMEO and SNDTIMEO do not work on all OSes (e.g. Solaris) so
       * we will set the socket in non-blocking mode, use poll with a timeout and
       * check the socket status for successful connection.
       * After this, the socket goes back to blocking mode.
       */
      oldflags = fcntl( s, F_GETFL, NULL );
      if (oldflags < 0)
         oldflags = 0;   /* ignore errors */
      if (fcntl( s, F_SETFL, oldflags|O_NONBLOCK ))
      {
         ;
      }

      /* with anti-aliasing warnings ON, connect/bind cause compiler warnings which we may ignore */
      if (connect( s, al->ai_addr, al->ai_addrlen) < 0)
      {
         if (errno == EINPROGRESS)
         {
            struct pollfd pfd;
            pfd.fd = s;
            pfd.events = POLLIN|POLLOUT|POLLPRI|POLLHUP;
            pfd.revents = 0;
            int rv = poll( &pfd, 1, 8000 );
            if (ufdbGV.debug > 1) ufdbLogMessage( "poll() returned %d", rv );
            if (rv < 0)
            {
               /* error; fall through */
            }
            else if (rv == 0)
            {
               /* timed out; fall through */
               errno = ETIMEDOUT;
               numTries++;              // increment numTries again to have max. 3 timeouts
            }
            else
            {
               /* poll() signalled that there is I/O. check socket error status */
               int errorStatus;
               socklen_t len;
               len = sizeof(int);
               if (getsockopt( s, SOL_SOCKET, SO_ERROR, (void *) &errorStatus, &len ) == 0)
               {
                  if (!errorStatus)
                  {
                     /* go back to blocking mode */
                     if (fcntl( s, F_SETFL, oldflags ))
                     {
                        ;
                     }
                     goto socket_ok;
                  }
                  /* error; fall through */
                  /* NOTE(review): errorStatus holds the reason of the failure but is not copied to errno */
                  if (errno == 0)
                     errno = ECONNREFUSED;
               }
               else
               {
                  /* error; fall through */
               }
            }
         }
         if (errno == EINPROGRESS)
            errno = EAGAIN;
         ufdbLogError( "UFDBopenSocket: cannot connect to %s/%s  port %d: %s - %s",
                       serverName, addrbuf, port, strerror(errno),
                       al->ai_next == NULL ? "no other addresses to try" : "trying next IP address" );
         close( s );
         s = -1;
      }
      else
      {
         /* The connection was established immediately: stop here.  Going on with the
          * next address would overwrite (leak) s and leave it in non-blocking mode.
          */
         if (fcntl( s, F_SETFL, oldflags ))
         {
            ;
         }
         goto socket_ok;
      }

      al = al->ai_next;
   }
socket_ok:
   freeaddrinfo( addrlist );

#if 0
   if (al == NULL)
      return -1;
#endif

   if (s >= 0)
   {
      /*
       *  Prevent long blocking on communication with the other side.
       */
      tv.tv_sec = 20;
      tv.tv_usec = 0;
      setsockopt( s, SOL_SOCKET, SO_RCVTIMEO, (void *) &tv, sizeof(tv) );

      tv.tv_sec = 20;
      tv.tv_usec = 0;
      setsockopt( s, SOL_SOCKET, SO_SNDTIMEO, (void *) &tv, sizeof(tv) );
   }

   return s;
}
794 #endif
795 
796 
797 #if UFDB_BARE_METAL_SUPPORT
// On bare metal there is no arpa/inet.h and hence no inet_pton() or inet_ntop().
// Normalising an IPv6 address is therefore limited to converting it to
// lowercase and removing the leading zeroes of each colon-separated field
// (a field with only zeroes becomes a single '0').
// The result is written to normalised, which must be able to hold a string of
// the same length as address.  Always returns 1.
inline static int UFDBnormaliseIPv6(
   const char * address,
   char *       normalised )
{
   const char * in;
   char *       out;

   in = address;
   out = normalised;
   for (;;)
   {
      if (*in == '\0')
         break;

      // start of a field: skip its leading zeroes
      if (*in == '0')
      {
         in += strspn( in, "0" );
         if (*in == ':'  ||  *in == '\0')
            *out++ = '0';
      }

      // copy the rest of the field in lowercase
      for ( ;  *in != '\0'  &&  *in != ':';  in++)
         *out++ = tolower( (int) *in );

      if (*in == ':')
      {
         *out++ = ':';
         in++;
      }
   }
   *out = '\0';

   return 1;	/* successful */
}
827 
828 #else
829 
/*
 * Normalise the textual IPv6 address in address by converting it to binary
 * with inet_pton() and back to text with inet_ntop().
 * normalised must have room for INET6_ADDRSTRLEN bytes.
 * Returns 1 on success.  Returns 0 when one of the conversions fails; when
 * address cannot be parsed, normalised is left as an empty string.
 */
inline static int UFDBnormaliseIPv6(
   const char * address,
   char *       normalised )
{
   struct in6_addr binaryAddress;
   int             converted;

   normalised[0] = '\0';

   converted = inet_pton( AF_INET6, address, (void *) &binaryAddress ) > 0  &&
               inet_ntop( AF_INET6, (void *) &binaryAddress, normalised, INET6_ADDRSTRLEN ) != NULL;
   if (!converted)
      return 0;

#ifdef UFDB_DEBUG_IPV6
   ufdbLogMessage( "      UFDBnormaliseIPv6  %s  ->  %s", address, normalised );
#endif

   return 1;	/* successful */
}
850 #endif
851 
852 
853 #ifndef INET6_ADDRSTRLEN
854 #define INET6_ADDRSTRLEN 48
855 #endif
856 
UFDBparseIPv6address(char * url,char * domain)857 char * UFDBparseIPv6address(
858    char * url,
859    char * domain )
860 {
861    char * url_start;
862    char * d;
863    char   normalisedAddress[INET6_ADDRSTRLEN];
864 
865 #ifdef UFDB_DEBUG_IPV6
866    ufdbLogMessage( "   UFDBparseIPv6address: url: %s", url );
867 #endif
868 
869    url_start = url;
870    *domain = '\0';
871    d = domain;
872    if (*url == '[')
873    {
874       url++;
875    }
876 
877    while (*url != '\0')
878    {
879       if (*url == ']')
880       {
881 	 *d++ = ']';
882 	 *d = '\0';
883 	 if (UFDBnormaliseIPv6( domain, normalisedAddress ))
884 	    strcpy( domain, normalisedAddress );
885 	 else
886 	 {
887 	    if (ufdbGV.debug)
888 	        ufdbLogMessage( "URL has invalid IPv6 address: %s", *url_start=='[' ? url_start+1 : url_start );
889 	    return NULL;
890 	 }
891 	 /* TODO: handle IPv4 in IPv6 addresses e.g.  ::127.0.0.1 */
892 	 return url;
893       }
894 
895       if (*url == ':'  ||  *url == '.'  ||  isxdigit( (int) *url))
896       {
897          *d++ = *url++;
898       }
899       else	/* URL address error */
900       {
901 	 *d = '\0';
902 	 if (ufdbGV.debug)
903 	     ufdbLogMessage( "URL has invalid IPv6 address: %s", *url_start=='[' ? url_start+1 : url_start );
904          return NULL;
905       }
906    }
907    *d = '\0';
908 
909 #ifdef UFDB_DEBUG_IPV6
910    ufdbLogMessage( "   UFDBparseIPv6address: domain: %s", domain );
911 #endif
912 
913    if (UFDBnormaliseIPv6( domain, normalisedAddress ))
914    {
915 #ifdef UFDB_DEBUG_IPV6
916       ufdbLogMessage( "      IPv6 domain '%s' normalised to '%s'", domain, normalisedAddress );
917 #endif
918       strcpy( domain, normalisedAddress );
919    }
920    else
921    {
922       if (ufdbGV.debug)
923 	  ufdbLogMessage( "URL has invalid IPv6 address: %s", *url_start=='[' ? url_start+1 : url_start );
924       return NULL;  /* address error */
925    }
926 
927    /* TODO: handle IPv4 in IPv6 addresses e.g.  ::127.0.0.1 */
928    return url;
929 }
930 
931 
/*
 * Replace the IPv6 address between '[' and ']' at the start of url by
 * newDomain, shifting the remainder of the URL when the lengths differ.
 *
 * url        must start with '[' and contain a ']'; otherwise an error is
 *            logged and url is not changed.  When newDomain is longer than
 *            the old address, the buffer of url must have room for the
 *            extra characters.
 * newDomain  the normalised address, without brackets.
 */
void UFDBupdateURLwithNormalisedDomain(
   char * url,
   char * newDomain )
{
   char * address;
   char * closing;
   size_t oldLength;
   size_t newLength;
   size_t tailBytes;

#ifdef UFDB_DEBUG_IPV6
   ufdbLogMessage( "      UFDBupdateURLwithNormalisedDomain: %s", url );
#endif

   if (*url != '[')
   {
      ufdbLogError( "UFDBupdateURLwithNormalisedDomain: URL does not start with '[': %s", url );
      return;
   }
   address = url + 1;

   closing = strchr( address, ']' );
   if (closing == NULL)
   {
      ufdbLogError( "UFDBupdateURLwithNormalisedDomain: URL does not have a ']': %s", address );
      return;
   }

   oldLength = (size_t) (closing - address);
   newLength = strlen( newDomain );

   if (newLength == oldLength)
   {
      /* the normalised domain name has equal length: overwrite in place */
      memcpy( address, newDomain, newLength );
      return;
   }

   tailBytes = strlen( closing ) + 1;           /* "]..." including the NUL */

   if (newLength > oldLength)
   {
      /* the newDomain string is longer than the original: make room first */
      memmove( address + newLength, closing, tailBytes );
      memcpy( address, newDomain, newLength );
      return;
   }

   /* the newDomain string is shorter than the original: copy it and close the gap */
   memcpy( address, newDomain, newLength );
   memmove( address + newLength, closing, tailBytes );
#ifdef UFDB_DEBUG_IPV6
   ufdbLogMessage( "      UFDBupdateURLwithNormalisedDomain: %s", url );
#endif
}
991 
992 
UFDBnormaliseIPv4(char * domain)993 void UFDBnormaliseIPv4( char * domain )
994 {
995    char *       d;
996    char *       orig;
997    unsigned int octetvalue;
998    char         dbuf[512];
999 
1000    orig = domain;
1001    d = dbuf;
1002    while (*domain != '\0')
1003    {
1004       if (*domain == '0')
1005       {
1006 	 domain++;
1007 	 octetvalue = 0;
1008          if (*domain == 'x')				/* obfuscated hexadecimal octet */
1009 	 {
1010 	    domain++;
1011 	    while (isxdigit((int) *domain))
1012 	    {
1013 	       octetvalue *= 16;
1014 	       if (*domain >= '0' && *domain <= '9')
1015 	          octetvalue += (*domain - '0');
1016 	       else
1017 	          octetvalue += (*domain - 'a' + 10);
1018 	       domain++;
1019 	    }
1020 	    if (*domain != '\0'  &&  *domain != '.')
1021 	    {
1022 	       ufdbLogError( "IPv4 address has illegal hexadecimal octet: %s", orig );
1023 	       return;
1024 	    }
1025 	 }
1026 	 else if (*domain >= '0'  &&  *domain <= '7')	/* obfuscated octal octet */
1027 	 {
1028 	    while (*domain >= '0'  &&  *domain <= '7')
1029 	    {
1030 	       octetvalue *= 8;
1031 	       octetvalue += (*domain - '0');
1032 	       domain++;
1033 	    }
1034 	    if (*domain != '\0'  &&  *domain != '.')
1035 	    {
1036 	       ufdbLogError( "IPv4 address has illegal octal octet: %s", orig );
1037 	       return;
1038 	    }
1039 	 }
1040 	 else
1041 	 {
1042 	    ufdbLogError( "IPv4 address has illegal octet: %s", orig );
1043 	    return;
1044 	 }
1045 	 if (octetvalue > 255)
1046 	 {
1047 	    ufdbLogError( "obfuscated IPv4 address has illegal octet value: %s", orig );
1048 	    return;
1049 	 }
1050 	 /* convert the octetvalue to a decimal string */
1051 	 d += sprintf( d, "%u", octetvalue );
1052       }
1053       else				/* octet is not obfuscated */
1054       {
1055          while (*domain != '\0'  &&  *domain != '.')
1056 	    *d++ = *domain++;
1057       }
1058 
1059       if (*domain == '.')
1060 	 *d++ = *domain++;
1061    }
1062    *d = '\0';
1063 
1064    if (ufdbGV.debug > 1)
1065       ufdbLogMessage( "obfuscated domain %s rewritten to %s", orig, dbuf );
1066 
1067    strcpy( orig, dbuf );
1068 }
1069 
1070 
UFDBaddYoutubeEdufilter(char * domain,char * strippedURL,char * originalURL)1071 int UFDBaddYoutubeEdufilter(
1072    char * domain,
1073    char * strippedURL,
1074    char * originalURL  )
1075 {
1076    char * dot;
1077 
1078    if (strcmp( domain, "youtube.com" ) == 0)
1079    {
1080       char * id = ufdbGV.YoutubeEdufilterID;
1081       if (id == NULL)
1082          return UFDB_API_OK;
1083 #if 0
1084       ufdbLogMessage( "   YouTube Edufilter: %s %s", domain, strippedURL );
1085 #endif
1086       dot = strrchr( strippedURL, '.' );
1087       if (dot == NULL)
1088       {
1089 	 if (strchr( strippedURL, '?' ) == NULL)
1090 	    strcat( originalURL, "?edufilter=" );
1091 	 else
1092 	    strcat( originalURL, "&edufilter=" );
1093 	 strcat( originalURL, id );
1094 
1095 	 return UFDB_API_MODIFIED_FOR_YOUTUBE_EDUFILTER;
1096       }
1097       else
1098       {
1099 	 if (strcmp( dot+1, "css" ) != 0  &&
1100 	     strcmp( dot+1, "ico" ) != 0  &&
1101 	     strcmp( dot+1, "gif" ) != 0  &&
1102 	     strcmp( dot+1, "jpg" ) != 0  &&
1103 	     strcmp( dot+1, "png" ) != 0  &&
1104 	     strcmp( dot+1, "js" )  != 0  &&
1105 	     strcmp( dot+1, "xml" ) != 0)
1106 	 {
1107 	    if (strchr( dot, '?' ) == NULL)
1108 	       strcat( originalURL, "?edufilter=" );
1109 	    else
1110 	       strcat( originalURL, "&edufilter=" );
1111 	    strcat( originalURL, id );
1112 
1113 	    return UFDB_API_MODIFIED_FOR_YOUTUBE_EDUFILTER;
1114 	 }
1115       }
1116    }
1117 
1118    return UFDB_API_OK;
1119 }
1120 
1121 
1122 /*
1123  * UFDBaddSafeSearch - modify a URL for a search which requires SafeSearch
1124  *
1125  * return UFDB_API_OK for unmodified URLs and UFDB_API_MODIFIED_FOR_SAFESEARCH
1126  *
1127  * parameters: domain - the domainname
1128  *             strippedURL - the stripped URL including the domainname
1129  *             originalURL - the unmodified user-supplied URL
1130  *	       The originalURL must be of type char[UFDB_MAX_URL_LENGTH]
1131  *	       and may be modified to force SafeSearch.
1132  */
UFDBaddSafeSearch(char * domain,char * strippedURL,char * originalURL)1133 int UFDBaddSafeSearch(
1134    char * domain,
1135    char * strippedURL,
1136    char * originalURL  )
1137 {
1138    char * slash;
1139 
1140    originalURL[UFDB_MAX_URL_LENGTH-28] = '\0';
1141 
1142    slash = strchr( strippedURL, '/' );
1143    if (slash == NULL)
1144       strippedURL = (char *) "";
1145    else
1146       strippedURL = slash;
1147 
1148 #if 0
1149    ufdbLogMessage( "   SS: %s %s", domain, strippedURL );
1150 #endif
1151 
1152    if (strstr( domain, "similar-images.googlelabs." ) != NULL  &&	/* Google images */
1153        strstr( strippedURL, "q=") != NULL)
1154    {
1155       strcat( originalURL, "&safe=active&safeui=on" );
1156       UFDB_API_num_safesearch++;
1157       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1158    }
1159    else
1160    if (strstr( domain, "images.google." ) != NULL  &&			/* Google images */
1161        strstr( strippedURL, "q=") != NULL)
1162    {
1163       strcat( originalURL, "&safe=active&safeui=on" );
1164       UFDB_API_num_safesearch++;
1165       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1166    }
1167    else
1168    if (strstr( domain, "youtube.com" ) != NULL  &&			/* Youtube */
1169        strstr( strippedURL, "search_query=") != NULL)
1170    {
1171       strcat( originalURL, "&safety_mode=true" );			/* unfortunately this does not work since */
1172       UFDB_API_num_safesearch++;					/* also need to set Cookie: (.*) PREF=f2=8000000 */
1173       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1174    }
1175    else
1176    if ((domain[0] <= '9' && domain[0] >= '0')  &&			/* google-related sites like www.google-tr.info */
1177        strstr( strippedURL, "cx=partner" ) != NULL  &&
1178        strstr( strippedURL, "/cse" ) != NULL  &&
1179        strstr( strippedURL, "q=" ) != NULL)
1180    {
1181       strcat( originalURL, "&safe=active" );
1182       UFDB_API_num_safesearch++;
1183       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1184    }
1185    else
1186    if ((domain[0] <= '9' && domain[0] >= '0')     &&			/* google.com, google.de, google.ws etc. */
1187        strncmp( strippedURL, "/search", 7 ) == 0  &&
1188        strstr( strippedURL, "q=" ) != NULL        &&
1189        (strncmp( domain, "74.125.", 7 ) == 0  ||
1190         strncmp( domain, "173.194.", 8 ) == 0))
1191    {
1192       strcat( originalURL, "&safe=active" );
1193       UFDB_API_num_safesearch++;
1194       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1195    }
1196    else
1197    if ((strstr( domain, "google." ) != NULL  ||				/* SAFESEARCH: google.* */
1198         strcmp( domain, "ajax.googleapis.com" ) == 0  ||
1199         strstr( domain, "googleusercontent.com" ) != NULL)  &&		/* Google */
1200        strstr( strippedURL, "q=" ) != NULL  &&
1201        ((strncmp( strippedURL, "/insights", 9 ) != 0  &&  strstr( strippedURL, "/search" ) != NULL)  ||
1202         strstr( strippedURL, "/uds/afs" ) != NULL ||
1203         strstr( strippedURL, "/uds/gwebsearch" ) != NULL ||
1204         strstr( strippedURL, "/uds/gvideosearch" ) != NULL ||
1205         strstr( strippedURL, "/uds/gimagesearch" ) != NULL ||
1206         strstr( strippedURL, "/uds/gblogsearch" ) != NULL ||
1207         strstr( strippedURL, "/videosearch" ) != NULL ||
1208         strstr( strippedURL, "/blogsearch" ) != NULL ||
1209         strstr( strippedURL, "/gwebsearch" ) != NULL ||
1210         strstr( strippedURL, "/groups" ) != NULL ||
1211         strstr( strippedURL, "/cse" ) != NULL ||
1212         strstr( strippedURL, "/products" ) != NULL ||
1213         strstr( strippedURL, "/images" ) != NULL ||
1214         strstr( strippedURL, "/custom" ) != NULL) )
1215    {
1216       char * safe;
1217       /* search for 'safe=off' and replace by 'safe=active' */
1218       safe = strstr( originalURL, "&safe=off" );
1219       if (safe != NULL)
1220       {
1221          safe += 6;
1222 	 *safe++ = 'a';		/* 'o' */
1223 	 *safe++ = 'c';		/* 'f' */
1224 	 *safe++ = 't';		/* 'f' */
1225 	 (void) memmove( safe+3, safe, strlen(safe)+1 );
1226 	 *safe++ = 'i';
1227 	 *safe++ = 'v';
1228 	 *safe   = 'e';
1229       }
1230       strcat( originalURL, "&safe=active&safeui=on" );
1231       UFDB_API_num_safesearch++;
1232       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1233    }
1234    else
1235    if (strstr( domain, "webmenu.com" ) != NULL  &&		/* SAFESEARCH: webmenu.com */
1236        (strstr( strippedURL, "q_or=") != NULL  ||
1237         strstr( strippedURL, "q_and=") != NULL  ||
1238         strstr( strippedURL, "ss=") != NULL  ||
1239         strstr( strippedURL, "keyword=") != NULL  ||
1240 	strstr( strippedURL, "query=") != NULL) )
1241    {
1242       char * p;
1243       /* TODO: fix problem of cookie override; a user can set preferences to turn the filter OFF
1244        * in the user preferences.
1245        */
1246       while ((p = strstr( originalURL, "&ss=n" )) != NULL)
1247          *(p+4) = 'y';
1248       strcat( originalURL, "&ss=y" );
1249       UFDB_API_num_safesearch++;
1250       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1251    }
1252    else
1253    if (strstr( domain, "blekko.com" ) != NULL &&		/* SAFESEARCH: blekko.com */
1254        strncmp( strippedURL, "/ws/", 4 ) == 0)
1255    {
1256       if (strchr( strippedURL, '?' ) == NULL)
1257 	 strcat( originalURL, "?safesearch=2" );
1258       else
1259 	 strcat( originalURL, "&safesearch=2" );
1260       UFDB_API_num_safesearch++;
1261       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1262    }
1263    else
1264    if (strstr( domain, "izito." ) != NULL  &&			/* SAFESEARCH: izito.* */
1265        (strstr( strippedURL, "query=" ) != NULL  ||
1266         strstr( strippedURL, "q=" ) != NULL))
1267    {
1268       strcat( originalURL, "&ss=y" );
1269       UFDB_API_num_safesearch++;
1270       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1271    }
1272    else
1273    if (strstr( domain, "zapmeta." ) != NULL  &&			/* SAFESEARCH: zapmeta.* */
1274        strstr( strippedURL, "vid=" ) != NULL  &&
1275        strstr( strippedURL, "q=" ) != NULL)
1276    {
1277       strcat( originalURL, "&ss=y" );
1278       UFDB_API_num_safesearch++;
1279       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1280    }
1281    else
1282    if (strstr( domain, "bing.com" ) != NULL  &&			/* SAFESEARCH: bing. */
1283        strstr( strippedURL, "q=" ) != NULL)    			/* bing */
1284    {
1285       strcat( originalURL, "&ADLT=STRICT&filt=all" );
1286       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1287    }
1288    else
1289    if (strstr( domain, "bing.co.uk" ) != NULL  &&		/* SAFESEARCH: bing.co.uk */
1290        strstr( strippedURL, "q=" ) != NULL)    			/* bing */
1291    {
1292       strcat( originalURL, "&ADLT=STRICT&filt=all" );
1293       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1294    }
1295    else
1296    if (strcmp( domain, "api.bing.net" ) == 0  &&		/* Safesearch: bing API */
1297        strncmp( strippedURL, "/json.aspx", 10 ) == 0  &&	/* called by searchgby.com */
1298        strstr( strippedURL, "query=" ) != NULL)
1299    {
1300       strcat( originalURL, "&Adult=Strict" );
1301       UFDB_API_num_safesearch++;
1302       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1303    }
1304    else
1305    if (strcmp( domain, "search.searchcompletion.com" ) == 0  &&		/* SAFESEARCH: searchcompletion.com */
1306        strncmp( strippedURL, "/localsearchresults.aspx", 10 ) == 0  &&		/* search.searchcompletion.com/LocalSearchResults.aspx */
1307        strstr( strippedURL, "q=" ) != NULL)
1308    {
1309       strcat( originalURL, "&safe=on" );			/* TO-DO: fix this */
1310       UFDB_API_num_safesearch++;
1311       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1312    }
1313    else
1314    if (strstr( domain, "pageset.com" ) != NULL  &&		/* pageset.com */
1315        strstr( strippedURL, "q=" ) != NULL)
1316    {
1317       char * t;
1318       t = strstr( strippedURL, "adt=1" );
1319       if (t != NULL)
1320          *(t+4) = '0';
1321       else
1322 	 strcat( originalURL, "&adt=0" );
1323       UFDB_API_num_safesearch++;
1324       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1325    }
1326    else
1327    if (strstr( domain, "trovator.com" ) != NULL  &&		/* searchcompletion.com trovator.com */
1328        strstr( strippedURL, "q=" ) != NULL)
1329    {
1330       strcat( originalURL, "&fil=si" );
1331       UFDB_API_num_safesearch++;
1332       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1333    }
1334    else
1335    if (strcmp( domain, "results.searchlock.com" ) == 0  &&      /* SAFESEARCH: searchlock.com */
1336        strstr( strippedURL, "q=") != NULL)
1337    {
1338       strcat( originalURL, "&sf=1" );
1339       UFDB_API_num_safesearch++;
1340       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1341    }
1342    else
1343    if (strstr( domain, ".yauba.com" ) != NULL  &&		/* SAFESEARCH: yauba.com */
1344        strstr( strippedURL, "query=") != NULL)
1345    {
1346       strcat( originalURL, "&ss=y" );
1347       UFDB_API_num_safesearch++;
1348       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1349    }
1350    else
1351    if (strstr( domain, "forestle.org" ) != NULL  &&		/* SAFESEARCH: forestle.org */
1352        (strstr( strippedURL, "settings") != NULL  ||
1353         strstr( strippedURL, "q=") != NULL))
1354    {
1355       strcat( originalURL, "&adultfilter=noadult" );
1356       UFDB_API_num_safesearch++;
1357       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1358    }
1359    else
1360    if (strstr( domain, "zombol.com" ) != NULL  &&		/* SAFESEARCH: zombol.com */
1361        strstr( strippedURL, "/results") != NULL  &&
1362        strstr( strippedURL, "q=") != NULL)
1363    {
1364       strcat( originalURL, "&safe=active" );
1365       UFDB_API_num_safesearch++;
1366       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1367    }
1368    else
1369    if (strstr( domain, "kalooga.com" ) != NULL  &&		/* SAFESEARCH: kalooga.com */
1370        strstr( strippedURL, "search") != NULL  &&
1371        strstr( strippedURL, "query=") != NULL)
1372    {
1373       strcat( originalURL, "&filter=default" );
1374       UFDB_API_num_safesearch++;
1375       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1376    }
1377    else
1378    if (strstr( domain, "muuler.com" ) != NULL  &&		/* SAFESEARCH: muuler.com */
1379        strstr( strippedURL, "/result") != NULL  &&
1380        strstr( strippedURL, "q=") != NULL)
1381    {
1382       strcat( originalURL, "&safe=active" );
1383       UFDB_API_num_safesearch++;
1384       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1385    }
1386    else
1387    if (strstr( domain, "foozir.com" ) != NULL  &&		/* SAFESEARCH: foozir.com */
1388        strstr( strippedURL, "/result") != NULL  &&
1389        strstr( strippedURL, "q=") != NULL)
1390    {
1391       strcat( originalURL, "&safe=active" );
1392       UFDB_API_num_safesearch++;
1393       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1394    }
1395    else
1396    if (strstr( domain, "moons.it" ) != NULL  &&			/* SAFESEARCH: moons.it */
1397        strstr( strippedURL, "/ricerca") != NULL  &&
1398        strstr( strippedURL, "q=") != NULL)
1399    {
1400       strcat( originalURL, "&safe=active" );
1401       UFDB_API_num_safesearch++;
1402       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1403    }
1404    else
1405    if (strstr( domain, "wotbox.com" ) != NULL  &&		/* SAFESEARCH: wotbox.com */
1406        (strstr( strippedURL, "q=") != NULL  ||
1407         strstr( strippedURL, "op0=") != NULL) )
1408    {
1409       strcat( originalURL, "&a=true" );
1410       UFDB_API_num_safesearch++;
1411       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1412    }
1413    else
1414    if (strstr( domain, "ant.com" ) != NULL  &&			/* SAFESEARCH: ant.com */
1415        strstr( strippedURL, "antq=") != NULL)
1416    {
1417       strcat( originalURL, "&safe=1" );
1418       UFDB_API_num_safesearch++;
1419       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1420    }
1421    else
1422    if ((strstr( domain, "duck.co" ) != NULL  ||			/* SAFESEARCH: duck.co */
1423         strstr( domain, "duckduckgo.org" ) != NULL  ||          /* SAFESEARCH: duckduckgo.org  */
1424         strstr( domain, "duckduckgo.com" ) != NULL)  &&         /* SAFESEARCH: duckduckgo.com  */
1425        strstr( strippedURL, "q=") != NULL)
1426    {
1427       strcat( originalURL, "&kp=1" );
1428       UFDB_API_num_safesearch++;
1429       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1430    }
1431    else
1432    if ((strstr( domain, "qbyrd.com" ) != NULL  ||  		/* SAFESEARCH: qbyrd.com */
1433         strstr( domain, "search-results.com" ) != NULL)  &&	/* SAFESEARCH: search-results.com */
1434        strstr( strippedURL, "q=") != NULL)
1435    {
1436       char * adt;
1437       adt = strstr( originalURL, "adt=1" );
1438       if (adt != NULL)
1439          *(adt+4) = '0';
1440       else
1441 	 strcat( originalURL, "&adt=0" );
1442       UFDB_API_num_safesearch++;
1443       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1444    }
1445    else
1446    if (strstr( domain, "easysearch.org.uk" ) != NULL  &&	/* SAFESEARCH: easysearch.org.uk */
1447        strstr( strippedURL, "search") != NULL)
1448    {
1449       strcat( originalURL, "&safe=on" );
1450       UFDB_API_num_safesearch++;
1451       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1452    }
1453    else
1454    if (strstr( domain, "ecosia.org" ) != NULL  &&
1455        strstr( strippedURL, "q=" ) != NULL)    			/* SAFESEARCH: ecosia.org */
1456    {
1457       strcat( originalURL, "&safeSearch:1" );
1458       UFDB_API_num_safesearch++;
1459       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1460    }
1461    else
1462    if (strstr( domain, "ask.com" ) != NULL  &&
1463        strchr( strippedURL, '?' ) != NULL)    			/* SAFESEARCH: ask.com */
1464    {
1465       strcat( originalURL, "&adt=0" );
1466       UFDB_API_num_safesearch++;
1467       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1468    }
1469    else
1470    if (strncmp( domain, "api.search.yahoo.", 17 ) == 0  &&	/* SAFESEARCH: API yahoo.* */
1471        strstr( strippedURL, "query=" ) != NULL)
1472    {
1473       strcat( originalURL, "&adult_ok=0" );
1474       UFDB_API_num_safesearch++;
1475       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1476    }
1477    else
1478    if ((strcmp( domain, "search.aol.com" ) == 0  ||
1479         strstr( domain, ".aolsearch.com" ) != NULL)  &&		/* Safesearch: AOL */
1480        strncmp( strippedURL, "/search", 7 ) == 0  &&
1481        strstr( strippedURL, "q=" ) != NULL)
1482    {
1483       strcat( originalURL, "&safesearch=1&sp_ss=1" );
1484       UFDB_API_num_safesearch++;
1485       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1486    }
1487    else
1488    if ((strstr( domain, ".terra." ) != NULL  &&  		/* SAFESEARCH: terra.* */
1489         strstr( domain, "busca" ) != NULL)  &&
1490        (strstr( strippedURL, "query=" ) != NULL  ||
1491         strstr( strippedURL, "source=" ) != NULL) )	  /* .ar .br .cl .co .ec .es */
1492    {
1493       strcat( originalURL, "&npl=%26safe%3dhigh" );
1494       UFDB_API_num_safesearch++;
1495       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1496    }
1497    else
1498    if (strcmp( domain, "search.alot.com" ) == 0  &&		/* SAFESEARCH: alot.com */
1499        strstr( strippedURL, "q=" ) != NULL)
1500    {
1501       strcat( originalURL, "&f=1" );
1502       UFDB_API_num_safesearch++;
1503       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1504    }
1505    else
1506    if (strstr( domain, "searchalot.com" ) != NULL  &&		/* SAFESEARCH: searchalot.com */
1507        strstr( strippedURL, "q=" ) != NULL)
1508    {
1509       strcat( originalURL, "&safesearch=high" );
1510       UFDB_API_num_safesearch++;
1511       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1512    }
1513    else
1514    if (strstr( domain, "alltheinternet.com" ) != NULL  &&	/* SAFESEARCH: alltheinternet.com */
1515        strstr( strippedURL, "q=" ) != NULL)
1516    {
1517       strcat( originalURL, "&safesearch=high" );
1518       UFDB_API_num_safesearch++;
1519       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1520    }
1521    else
1522    if (strstr( domain, "search.yahoo." ) != NULL  &&		/* SAFESEARCH: yahoo.* */
1523        strstr( strippedURL, "p=" ) != NULL)
1524    {
1525       strcat( originalURL, "&vm=r" );
1526       /* TODO: investigate http://www.yahoo.com/r/sx/ *-http://search.yahoo.com/search */
1527       UFDB_API_num_safesearch++;
1528       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1529    }
1530    else
1531    if (strstr( domain, "inspsearchapi.com" ) != NULL  &&
1532        strncmp( strippedURL, "/search", 7 ) == 0)
1533    {
1534       strcat( originalURL, "&family-friendly=on" );
1535       UFDB_API_num_safesearch++;
1536       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1537    }
1538    else
1539    if (strstr( domain, "excite." ) != NULL  &&			/* SAFESEARCH: excite.* */
1540        strstr( strippedURL, "search" ) != NULL  &&
1541        strchr( strippedURL, '?' ) != NULL)  			/* Excite */
1542    {
1543       strcat( originalURL, "&familyfilter=1&splash=filtered" );
1544       UFDB_API_num_safesearch++;
1545       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1546    }
1547    else
1548    if (strncmp( domain, "search.msn.", 11 ) == 0)		/* SAFESEARCH: msn.* */
1549    {
1550       if (slash == NULL)
1551          strcat( originalURL, "/" );
1552       strcat( originalURL, "&adlt=strict" );
1553       UFDB_API_num_safesearch++;
1554       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1555    }
1556    else
1557    if (strncmp( domain, "search.live.", 12 ) == 0  &&		/* SAFESEARCH: live.* */
1558        strstr( strippedURL, "q=" ) != NULL)
1559    {
1560       strcat( originalURL, "&adlt=strict" );
1561       UFDB_API_num_safesearch++;
1562       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1563    }
1564    else
1565    if (strcmp( domain, "api.search.live.net" ) == 0  &&		/* Safesearch: live API */
1566        strstr( strippedURL, "sources=" ) != NULL)
1567    {
1568       strcat( originalURL, "&adlt=strict" );
1569       UFDB_API_num_safesearch++;
1570       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1571    }
1572    else
1573    if (strstr( domain, "blinkx.com" ) != NULL  &&		/* SAFESEARCH: blinkx.com */
1574        strchr( strippedURL, '?' ) != NULL)
1575    {
1576       strcat( originalURL, "&safefilter=on" );
1577       UFDB_API_num_safesearch++;
1578       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1579    }
1580    else
1581    if (strcmp( domain, "etools.ch" ) == 0  &&
1582        strncmp( strippedURL, "/search", 7 ) == 0)
1583    {
1584       strcat( originalURL, "&safeSearch=true" );
1585       UFDB_API_num_safesearch++;
1586       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1587    }
1588    else
1589    if (strncmp( domain, "search.lycos.", 13 ) == 0)		/* SAFESEARCH: lycos.* */
1590    {
1591       if (slash == NULL)
1592          strcat( originalURL, "/" );
1593       strcat( originalURL, "&contentFilter=strict&family=on" );
1594       UFDB_API_num_safesearch++;
1595       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1596    }
1597    else
1598    if (strstr( domain, "dogpile.com" ) != NULL  ||		/* SAFESEARCH: dogpile.com */
1599        strstr( domain, "dogpile.co.uk" ) != NULL)		/* SAFESEARCH: dogpile.co.uk  */
1600    {
1601       if (slash == NULL)
1602          strcat( originalURL, "/" );
1603       strcat( originalURL, "&adultfilter=heavy" );
1604       UFDB_API_num_safesearch++;
1605       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1606    }
1607    else
1608    if (strstr( domain, "infospace.com" ) != NULL)		/* SAFESEARCH: infospace.com */
1609    {
1610       if (slash == NULL)
1611          strcat( originalURL, "/" );
1612       strcat( originalURL, "&familyfilter=1" );
1613       UFDB_API_num_safesearch++;
1614       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1615    }
1616    else
1617    if (strstr( domain, "metacrawler.com" ) != NULL)		/* SAFESEARCH: metacrawler.com */
1618    {
1619       if (slash == NULL)
1620          strcat( originalURL, "/" );
1621       strcat( originalURL, "&familyfilter=1" );
1622       UFDB_API_num_safesearch++;
1623       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1624    }
1625    else
1626    if (strstr( domain, "webfetch.com" ) != NULL  ||		/* SAFESEARCH: webfetch.com */
1627        strstr( domain, "webfetch.co.uk" ) != NULL)		/* SAFESEARCH: webfetch.co.uk */
1628    {
1629       if (slash == NULL)
1630          strcat( originalURL, "/" );
1631       strcat( originalURL, "&familyfilter=1" );
1632       UFDB_API_num_safesearch++;
1633       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1634    }
1635    else
1636    if (strstr( domain, "webcrawler.com" ) != NULL)		/* SAFESEARCH: webcrawler.com */
1637    {
1638       if (slash == NULL)
1639          strcat( originalURL, "/" );
1640       strcat( originalURL, "&familyfilter=1" );
1641       UFDB_API_num_safesearch++;
1642       return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1643    }
1644 
1645    return UFDB_API_OK;
1646 }
1647 
1648 
#if 0
/*
 * squeeze_html_char - decode the hexadecimal digits of an HTML numeric character.
 * This function is currently disabled (compiled out).
 *
 * parameters: p   - points to the first hexadecimal digit
 *             hex - receives the decoded and normalised character value
 *
 * return the number of digits that were parsed, or -1 if the digits are not
 * followed by ';'.
 *
 * The decoded value is normalised: control characters other than \t \n \r \f,
 * 0x7f and all values of 0xff and higher become a space, and 'A'-'Z' are
 * converted to lower case.  A value of 0 is returned as is.
 */
UFDB_GCC_HOT UFDB_GCC_INLINE
static int squeeze_html_char(
   char * p,
   int *  hex )
{
   int    length;

   length = 0;
   *hex = 0;
   /* <ctype.h> functions require an unsigned char value */
   while (isxdigit( (unsigned char) *p ))
   {
      int c;
      int h;

      c = tolower( (unsigned char) *p );	/* accept 'A'-'F' as well as 'a'-'f' */
      h = (c <= '9') ? c - '0' : c - 'a' + 10;
      /* Values above 0xff are all mapped to a space below, so stop accumulating
       * once that range is reached: this prevents signed integer overflow.
       */
      if (*hex <= 0x00ff)
         *hex = *hex * 16 + h;
      p++;
      length++;
   }

#if 0
   ufdbLogMessage( "   squeeze_html_char hex=%04x  length=%d  *p=%c", *hex, length, *p );
#endif

   if (*p != ';')
      return -1;		/* '&#xxx' without trailing ';' is not a valid HTML character */

   if (*hex == 0)
      return length;

   if (*hex < 0x0020)
   {
      if (*hex != '\t'  &&  *hex != '\n'  &&  *hex != '\r'  &&  *hex != '\f')
         *hex = ' ';
   }
   else if (*hex == 0x007f  ||  *hex >= 0x00ff)
   {
      *hex = ' ';
   }
   else if (*hex <= 'Z'  &&  *hex >= 'A')
   {
      *hex += 'a' - 'A';
   }

   return length;
}
#endif
1695 
1696 
1697 UFDB_GCC_INLINE  UFDB_GCC_HOT
increment_UFDB_API_num_url_lookups(void)1698 static void increment_UFDB_API_num_url_lookups( void )
1699 {
1700 #if 0
1701    UFDB_API_num_url_lookups++;  // do not use __sync_add_and_fetch()
1702 #elif defined(__GNUC__)  &&  __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4  &&  __SIZEOF_LONG__ == 4
1703    (void) __sync_add_and_fetch( &UFDB_API_num_url_lookups, 1 );
1704 #elif defined(__GNUC__)  &&  __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8  &&  __SIZEOF_LONG__ == 8
1705    (void) __sync_add_and_fetch( &UFDB_API_num_url_lookups, 1 );
1706 #else
1707    UFDB_SHARED static ufdb_mutex incrMutex = ufdb_mutex_initializer;
1708    ufdb_mutex_lock( &incrMutex );
1709    UFDB_API_num_url_lookups++;
1710    ufdb_mutex_unlock( &incrMutex );
1711 #endif
1712 }
1713 
1714 
1715 UFDB_GCC_HOT
increment_UFDB_API_num_https(void)1716 void increment_UFDB_API_num_https( void )
1717 {
1718 #if 0
1719    UFDB_API_num_https++;  // do not use __sync_add_and_fetch()
1720 #elif defined(__GNUC__)  &&  __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4  &&  __SIZEOF_LONG__ == 4
1721    (void) __sync_add_and_fetch( &UFDB_API_num_https, 1 );
1722 #elif defined(__GNUC__)  &&  __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8  &&  __SIZEOF_LONG__ == 8
1723    (void) __sync_add_and_fetch( &UFDB_API_num_https, 1 );
1724 #else
1725    UFDB_SHARED static ufdb_mutex incrMutex = ufdb_mutex_initializer;
1726    ufdb_mutex_lock( &incrMutex );
1727    UFDB_API_num_https++;
1728    ufdb_mutex_unlock( &incrMutex );
1729 #endif
1730 }
1731 
1732 
1733 /*
1734  * strip a URL:
1735  * remove http:// prefix,
1736  * remove www[0-9*]. prefix,
1737  * remove port number,
1738  * remove username and password,
1739  * remove IP address obfuscations (numbers with leading zeroes)
1740  * convert hex codes (%61 = 'a') to characters,
1741  * convert special characters to void or space,
1742  * convert characters to lower case.
1743  * substitute // by / in a URL
1744  * substitute /./ by / in a URL
1745  * substitute /foo/../bar by /bar in a URL
1746  * trim very long URLs (more than 512 letters)
1747  * do not remove "#foo"
1748  */
1749 UFDB_GCC_HOT
UFDBstripURL2(char * URL,int stripwwwprefix,char * strippedUrl,char * domain,char * protocol,int * portnumber)1750 void UFDBstripURL2(
1751    char * URL, 			/* input URL string */
1752    int    stripwwwprefix,	/* input flag for stripping "www." prefix from URL */
1753    char * strippedUrl,  	/* output char array (must be UFDB_MAX_URL_LENGTH bytes) */
1754    char * domain,       	/* output char array (must be 1024 bytes) */
1755    char * protocol,		/* output char array (must be 16 bytes) */
1756    int  * portnumber )		/* output integer */
1757 {
1758    char * p;
1759    char * tmp;
1760    char * domain_start;
1761    char * domain_end;
1762    char * optional_token;
1763    char * origStrippedUrl;
1764    int    port;
1765    int    is_ip_address;
1766    int    is_ipv6;
1767    int    obfuscated;
1768    int    insideParams;
1769    char   buffer[UFDB_MAX_URL_LENGTH];
1770 
1771    /*
1772     * This parser has the following parsing stages:
1773     * 1) parse protocol and save it (output).
1774     * 2) skip optional username:password@ and initial "www[nnn]."
1775     * 3) parse FQDN, stop at ':' (port number) or any of / ; ? # (start of path)
1776     *    if the FQDN starts with '[' the URL has an IPv6 and parsing is done by separate function.
1777     * 4) store optional port (output) (default is 80, or 443 if protocol is https)
1778     *    URL without protocol, username/password, port is now stored in temporary buffer 'buffer'
1779     *    NEW in v1.32: the parameters of the URL are also copied to the buffer.
1780     * 5) The FDQN is saved in domainname (output).
1781     * 6) The URL buffer is converted to lowercase but parameters are not converted.
1782     * 7) Check for obfuscated IP addresses and normalise them
1783     * 8) Copy/convert the buffer to strippedURL (output) converting %xx and &#xx; and // and /./ and /../
1784     */
1785 
1786    *portnumber = 80;
1787    is_ipv6 = 0;
1788 
1789    increment_UFDB_API_num_url_lookups();
1790 
1791    /* strip http:// and ftp:// protocol header */
1792    p = findProtocolEnd( URL );
1793    if (p != NULL)
1794    {
1795       int n;
1796       n = p - URL;
1797       if (n == 0 || n > 14)
1798       {
1799 	 /* ERROR: an absent or very large protocol name... The URL does not start with a valid protocol.
1800 	  */
1801       	 strcpy( protocol, "http" );
1802 	 p = URL;
1803       }
1804       else
1805       {
1806 	 memcpy( protocol, URL, n );
1807 	 protocol[n] = '\0';
1808 	 if (n == 5  &&  strcasecmp( protocol, "https" ) == 0)
1809 	 {
1810 	    increment_UFDB_API_num_https();
1811 	    *portnumber = 443;
1812 	 }
1813 	 p += 3;			/* skip '://' */
1814       }
1815    }
1816    else
1817    {
1818       strcpy( protocol, "http" );
1819       p = URL;
1820    }
1821 
1822    domain_end = findDomainEnd( p );				/* might not be accurate and skipped the ':port' */
1823 
1824    optional_token = strchr_before( p, domain_end, '@' );	/* strip user:password@ */
1825    if (optional_token != NULL)
1826       p = optional_token + 1;
1827 
1828    domain_start = p;
1829 
1830 #if 0
1831    if (ufdbGV.debug)
1832       ufdbLogMessage( "   UFDBstripURL2: p: %s\n", p );
1833 #endif
1834 
1835    if (*p == '[')			/* IPv6 URL: http://[::1]:80/index.html */
1836    {
1837       char * end;
1838       char * oldBracket;
1839 
1840       is_ipv6 = 1;
1841       oldBracket = strchr( p, ']' );
1842       if (oldBracket != NULL)
1843          *oldBracket = '\0';
1844       end = UFDBparseIPv6address( p, domain );
1845       if (oldBracket != NULL)
1846          *oldBracket = ']';
1847       if (end != NULL)
1848       {
1849 	 UFDBupdateURLwithNormalisedDomain( p, domain );
1850 	 /* uh-oh: the normalised domain is usually smaller and our pointers have moved */
1851 	 domain_end = findDomainEnd( p );
1852 	 oldBracket = strchr( p, ']' );
1853 	 if (oldBracket == NULL)
1854 	    oldBracket = domain_end - 1;
1855 
1856 	 optional_token = strchr_before( oldBracket, domain_end, ':' );
1857 #if 0
1858 	 if (ufdbGV.debug > 1)
1859 	    ufdbLogMessage( "    UFDBstripURL2:  domain_end: %08x oldBracket: %08x  token: %08x\n",
1860 		            domain_end, oldBracket, optional_token );
1861 #endif
1862       }
1863       else
1864 	 optional_token = NULL;
1865    }
1866    else
1867    {
1868       if (stripwwwprefix)					    /* strip www[0-9]{0,2}. */
1869       {
1870 	 if ((p[0] == 'w' || p[0] == 'W')  &&
1871 	     (p[1] == 'w' || p[1] == 'W')  &&
1872 	     (p[2] == 'w' || p[2] == 'W'))
1873 	 {
1874 	    tmp = p + 3;
1875 	    if (*tmp <= '9'  &&  *tmp >= '0')
1876 	       tmp++;
1877 	    if (*tmp <= '9'  &&  *tmp >= '0')
1878 	       tmp++;
1879 	    if (*tmp == '.'  &&  strchr_before( tmp+1, domain_end, '.' ) != NULL)
1880 	       p = tmp + 1;
1881 	 }
1882       }
1883       optional_token = strchr_before( p, domain_end, ':' );
1884    }
1885 
1886    								/* parse-and-strip ":<portnum>" */
1887    tmp = buffer;
1888    if (optional_token != NULL)
1889    {                                                            /* copy domain name */
1890       while (p < optional_token)
1891          *tmp++ = *p++;
1892       *tmp = '\0';
1893 
1894       p++;
1895       port = 0;
1896       while (*p <= '9'  &&  *p >= '0')
1897       {
1898 	 port = port * 10 + (*p - '0');
1899          p++;
1900       }
1901 
1902       if (port == 443  &&  *portnumber != 443)			/* Squid sends "example.com:443" with CONNECT */
1903       {
1904 	 increment_UFDB_API_num_https();
1905          strcpy( protocol, "https" );
1906       }
1907 
1908       *portnumber = port;
1909       ufdbStrncpy( tmp, p, UFDB_MAX_URL_LENGTH-256-6-1 );	/* copy rest of the URL */
1910    }
1911    while (1)
1912    {
1913       if (*p == '\0'  ||  *p == '#') break;
1914       *tmp++ = *p++;
1915 
1916       if (*p == '\0'  ||  *p == '#') break;
1917       *tmp++ = *p++;
1918 
1919       if (*p == '\0'  ||  *p == '#') break;
1920       *tmp++ = *p++;
1921 
1922       if (*p == '\0'  ||  *p == '#') break;
1923       *tmp++ = *p++;
1924 
1925       if (tmp >= &buffer[UFDB_MAX_URL_LENGTH-2-4])
1926          break;
1927    }
1928    *tmp = '\0';
1929 
1930    if (!is_ipv6)				/* save the original domainname */
1931    {
1932       int n;
1933 
1934       if (optional_token != NULL)
1935 	 domain_end = optional_token;
1936       n = domain_end - domain_start;
1937       if (n >= 1023)
1938       {
1939          strcpy( domain, "domaintoolong.urlfilterdb.com" );
1940       }
1941       else
1942       {
1943          memcpy( domain, domain_start, n );
1944          domain[n] = '\0';
1945       }
1946    }
1947 
1948    /*
1949     * Now a temporary URL is in 'buffer'.
1950     * The temporary URL has no protocol, portnum, username/password, initial "www[nnn].".
1951     * Convert URL to lower case but stop at the first '?' to leave parameters untouched.
1952     */
1953    tmp = buffer;
1954    while (*tmp != '\0')
1955    {
1956       if (*tmp == '?')
1957       {
1958          if (!ufdbGV.parseURLparameters)            // UFDBstripURL2(): parse parameters or not?
1959             *tmp = '\0';
1960          goto stop_lowercasing;
1961       }
1962       if (*tmp <= 'Z'  &&  *tmp >= 'A')
1963 	 *tmp += 'a' - 'A';
1964       tmp++;
1965    }
1966    *tmp++ = '\0';				/* prevent problems with % at end of URL */
1967    *tmp = '\0';
1968 stop_lowercasing:
1969 
1970 #if 0
1971    if (ufdbGV.debug)
1972       ufdbLogMessage( "   UFDBstripURL2: after lowercasing: %s\n", buffer );
1973 #endif
1974 
1975    /* scan for IP address obfuscations */
1976    obfuscated = 0;
1977    is_ip_address = 1;
1978    for (tmp = domain;  *tmp != '\0';  )
1979    {
1980       if (*tmp == '0')
1981 	 obfuscated = 1;
1982       if (*tmp == '0' && *(tmp+1) == 'x')				/* parse 1 hex octet 0xHH */
1983       {
1984          tmp += 2;
1985 	 while (*tmp != '\0'  &&  isxdigit((int) *tmp))
1986 	    tmp++;
1987 	 if (*tmp != '\0'  &&  *tmp != '.')
1988 	 {
1989 	    is_ip_address = 0;
1990 	    break;
1991 	 }
1992       }
1993       else if (*tmp <= '9' && *tmp >= '0')
1994       {
1995 	 while (*tmp != '\0'  &&  (*tmp <= '9' && *tmp >= '0'))		/* parse 1 octal or integer octet */
1996 	    tmp++;
1997 	 if (*tmp != '\0'  &&  *tmp != '.')
1998 	 {
1999 	    is_ip_address = 0;
2000 	    break;
2001 	 }
2002       }
2003       else
2004       {
2005 	 is_ip_address = 0;
2006 	 break;
2007       }
2008       if (*tmp == '.')
2009          tmp++;
2010    }
2011    if (is_ip_address  &&  obfuscated)
2012       removeObfuscations( domain, buffer );
2013 
2014    /*
2015     *  Copy the buffer to strippedUrl, while converting hex codes to characters.
2016     *  After the first '?' we only do %HH character conversion
2017     */
2018    insideParams = 0;
2019    origStrippedUrl = strippedUrl;
2020    p = buffer;
2021    while (*p != '\0')
2022    {
2023       if (*p == ':'  &&  *(p+1) == '/'  &&  *(p+2) == '/')		/* do not replace :// by :/  */
2024       {
2025          *strippedUrl++ = *p++;
2026          *strippedUrl++ = *p++;
2027          *strippedUrl++ = *p++;
2028       }
2029       else if (*p == '%')				/* start of a HEX code */
2030       {
2031          if (isxdigit((int) *(p+1)) && isxdigit((int) *(p+2)))
2032 	 {
2033 	    char   h;
2034 	    int    hex;
2035 
2036 	    h = *(p+1);
2037             if (h <= '9')
2038                hex = (h - '0') * 16;
2039             else if (h <= 'F')
2040                hex = (h - 'A' + 10) * 16;
2041             else
2042                hex = (h - 'a' + 10) * 16;
2043 	    h = *(p+2);
2044             if (h <= '9')
2045                hex += (h - '0');
2046             else if (h <= 'F')
2047                hex += (h - 'A' + 10);
2048             else
2049                hex += (h - 'a' + 10);
2050 	    /* be careful with control characters */
2051 	    if (hex < 0x20)
2052 	    {
2053 	       if (hex == 0)
2054 	       {
2055 	          p += 3;
2056 		  continue;
2057 	       }
2058                hex = ' ';
2059 	       *strippedUrl++ = hex;
2060 	       p += 3;
2061 	    }
2062 	    else
2063 	    {
2064 	       if (!insideParams  &&  hex <= 'Z'  &&  hex >= 'A')
2065 		  hex += 'a' - 'A';
2066 	       else if (hex == 0x7f)
2067 	          hex = ' ';
2068 
2069 	       *strippedUrl++ = hex;
2070 	       p += 3;
2071                if (hex == ':'  &&  *(p) == '/'  &&  *(p+1) == '/')     // do not replace :// by :/
2072                {
2073                   *strippedUrl++ = '/';
2074                   *strippedUrl++ = '/';
2075                   p += 2;
2076                }
2077 	    }
2078 	 }
2079 	 else 					/* erroneous code */
2080 	 {
2081 	    *strippedUrl++ = *p++;		/* just copy the '%' */
2082 	 }
2083       }
2084       else					/* plain character */
2085       {
2086 	 while (*p == '/')
2087 	 {
2088 	    if (*(p+1) == '/')					/* substitute // by /    but not in "xxx://" */
2089 	       p++;
2090 	    else if (*(p+1) == '.'  && *(p+2) == '/')		/* substitute /./ by / */
2091 	       p += 2;
2092 	    else if (*(p+1) == '.'  &&  *(p+2) == '.'  &&  *(p+3) == '/')    /* substitute /xxxx/../ by / */
2093 	    {
2094 	       /* try to find the previous directory... */
2095 	       char * tmp;
2096 	       tmp = strippedUrl - 1;
2097 	       while (*tmp != '/'  &&  tmp > origStrippedUrl)
2098 		  tmp--;
2099 	       if (tmp > origStrippedUrl)
2100 	       {
2101 		  strippedUrl = tmp;
2102 		  p += 3;
2103 	       }
2104 	       else
2105 		  break;
2106 	    }
2107 	    else
2108 	       break;
2109 	 }
2110 	 if (*p == '?')		/* just copy the '?' */
2111 	 {
2112             insideParams = 1;
2113             *strippedUrl++ = *p++;
2114 	 }
2115 	 else
2116 #if 0
2117          if (*p == '#')
2118          {
2119             break;
2120          }
2121          else
2122 #endif
2123 	    *strippedUrl++ = *p++;
2124       }
2125    }
2126    *strippedUrl++ = '\0';
2127    *strippedUrl = '\0';		/* sset2 requires double \0 termination */
2128 }
2129 
2130 
2131 UFDB_GCC_HOT
UFDBstripURL(char * URL,char * strippedUrl,char * domain,char * protocol,int * portnumber)2132 void UFDBstripURL(
2133    char * URL, 			/* input URL string */
2134    char * strippedUrl,  	/* output char array (must be UFDB_MAX_URL_LENGTH bytes) */
2135    char * domain,       	/* output char array (must be 1024 bytes) */
2136    char * protocol,		/* output char array (must be 16 bytes) */
2137    int  * portnumber )		/* output integer */
2138 {
2139    UFDBstripURL2( URL, 1, strippedUrl, domain, protocol, portnumber );
2140 }
2141 
2142 
/*
 * UFDBprintable - make a string safe for printing.
 *
 * Every byte of 'string' whose value lies outside the range 32..126 is
 * overwritten, in place, by a '?'.
 *
 * Returns 'string' itself.  When 'string' is a NULL pointer, a pointer to
 * the constant text "NULL" is returned instead; since that is a string
 * literal, the caller must not write to it.
 */
char * UFDBprintable( char * string )
{
   char * cursor;

   if (string == NULL)
      return (char *) "NULL";

   for (cursor = string;  *cursor != '\0';  cursor++)
   {
      /* keep only bytes inside the printable range 32..126 */
      if (!(*cursor >= 32  &&  *cursor <= 126))
         *cursor = '?';
   }

   return string;
}
2160 
2161 
2162 #if !UFDB_BARE_METAL_SUPPORT
/*
 * ufdbGetSysInfo - fill 'si' with information about the running system.
 *
 * The structure is filled by uname().  On success the machine, release and
 * sysname fields are forcibly NUL-terminated, and nodename is then
 * overwritten with the result of gethostname() (whose return value is
 * deliberately ignored) and NUL-terminated as well.
 *
 * When uname() fails, machine, release, nodename and sysname receive fixed
 * placeholder strings; no other field of 'si' is written in that case.
 */
void ufdbGetSysInfo(
   struct utsname * si )
{
   if (uname( si ) >= 0)
   {
      /* make sure the strings are terminated, whatever uname() delivered */
      si->sysname[ sizeof(si->sysname)-1 ] = '\0';
      si->release[ sizeof(si->release)-1 ] = '\0';
      si->machine[ sizeof(si->machine)-1 ] = '\0';

      /* prefer the name reported by gethostname() over the one of uname() */
      (void) gethostname( si->nodename, sizeof(si->nodename) );
      si->nodename[ sizeof(si->nodename)-1 ] = '\0';
      return;
   }

   /* uname() failed: hand out placeholders */
   strcpy( si->sysname, "sysname" );
   strcpy( si->nodename, "unknown" );
   strcpy( si->release, "R?" );
   strcpy( si->machine, "M?" );
}
2183 
2184 
/*
 * ufdbGetNumCPUs - return the number of CPUs available to this process.
 *
 * Three build variants exist:
 *  - _SC_NPROCESSORS_ONLN is defined: the value of sysconf() is returned
 *    unchanged (so a sysconf() failure is passed on as -1).
 *  - otherwise, when __NR_sched_getaffinity is defined: the bits set in the
 *    affinity mask of this process are counted; 0 is returned when the
 *    system call fails.
 *  - otherwise: 0 is returned, meaning "unknown".
 */
long ufdbGetNumCPUs( void )
{
   long num_cpus = 0;

#if defined(_SC_NPROCESSORS_ONLN)
   num_cpus = sysconf( _SC_NPROCESSORS_ONLN );

#elif defined(__NR_sched_getaffinity)
   {
      /* sched_setaffinity() is buggy on linux 2.4.x so we use syscall() instead */
      unsigned long cpu_mask = 0;
      unsigned int  bit;

      /* the kernel requires the mask length to be a multiple of sizeof(unsigned long) */
      if (syscall( __NR_sched_getaffinity, getpid(), sizeof(cpu_mask), &cpu_mask ) >= 0)
      {
         for (bit = 0;  bit < sizeof(cpu_mask) * 8;  bit++)
         {
            if (cpu_mask & (1UL << bit))	/* 1UL: no signed overflow on the top bit */
               num_cpus++;
         }
      }
   }
#endif

   return num_cpus;
}
2211 #endif
2212 
2213 
/*
 * UFDBcalcCksum - compute a simple checksum over a memory area.
 *
 * Starting from 17, the running value is multiplied by 13 and increased by
 * three times the next byte, for each of the 'size' bytes at 'mem'.
 * The arithmetic is done in unsigned int, so it wraps around silently.
 * Bytes are read as plain 'char' before the conversion to unsigned int.
 *
 * Returns the running value modulo 100000, i.e. a number in 0..99999.
 * When 'size' is zero or negative no byte is read and 17 is returned.
 */
int UFDBcalcCksum( char * mem, long size )
{
   unsigned int sum;
   long         i;

   sum = 17;
   for (i = 0;  i < size;  i++)
      sum = sum * 13 + ((unsigned int) mem[i]) * 3;

   return (int) (sum % 100000);
}
2224 
2225 
2226 #if (__GNUC__ > 4)  ||  (__GNUC__ == 4  &&  __GNUC_MINOR__ >= 4)
2227 // #pragma GCC pop_options
2228 #endif
2229 
2230 
2231 #ifdef __cplusplus
2232 }
2233 #endif
2234