1 /*
2 * ufdblib.c - URLfilterDB
3 *
4 * ufdbGuard is copyrighted (C) 2005-2020 by URLfilterDB with all rights reserved.
5 *
6 * Parts of ufdbGuard are based on squidGuard.
7 * This module is NOT based on squidGuard.
8 *
9 * RCS $Id: ufdblib.c,v 1.171 2020/10/30 15:38:09 root Exp root $
10 */
11
/* This module has been well tested and stable for a long time.
 * For maximum performance, _FORTIFY_SOURCE is undefined.
 */
15 #undef _FORTIFY_SOURCE
16
17 #if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)
18 #pragma GCC push_options
19 #pragma GCC optimize ("O3")
20 #endif
21
22 /* to inline string functions with gcc : */
23 #if defined(__OPTIMIZE__) && defined(__GNUC__) && defined(GCC_INLINE_STRING_FUNCTIONS_ARE_FASTER)
24 #define __USE_STRING_INLINES 1
25 #endif
26
27 #include "ufdb.h"
28 #include "ufdblib.h"
29 #include "ufdblocks.h"
30
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <strings.h>
34 #include <string.h>
35 #include <unistd.h>
36 #include <ctype.h>
37 #include <time.h>
38 #include <errno.h>
39 #include <sys/types.h>
40 #include <sys/time.h>
41
42 #if !UFDB_BARE_METAL_SUPPORT
43 #include <syslog.h>
44 /* TODO: evaluate use of syslog() */
45 #include <sys/stat.h>
46 #include <fcntl.h>
47 #include <poll.h>
48 #include <sys/socket.h>
49 #if HAVE_UNIX_SOCKETS
50 #include <sys/un.h>
51 #endif
52 #include <netdb.h>
53 #include <netinet/in.h>
54 #include <netinet/tcp.h>
55 #include <netinet/ip6.h>
56 // for inet_pton() and inet_ntop()
57 #include <arpa/inet.h>
58 #endif
59
60 #if UFDB_BARE_METAL_SUPPORT && __OCTEON__
61 #include "cvmx-rtc.h"
62 #endif
63
64 #ifdef __cplusplus
65 extern "C" {
66 #endif
67
/*
 * strmatchN(a,b): fixed-length string equality tests.
 *
 * The disabled "#if 0" variant simply uses strcmp().
 * The active variant compares the first N characters of a and b one by one
 * and then requires a[N] to be the terminating '\0', i.e. a must be exactly
 * N characters long.  Only a is checked for the terminator; b[N] is never
 * inspected.  Both arguments are evaluated more than once, so they must not
 * have side effects.
 */
#if 0
#define strmatch2(a,b) (strcmp(a,b) == 0)
#define strmatch3(a,b) (strcmp(a,b) == 0)
#define strmatch4(a,b) (strcmp(a,b) == 0)
#define strmatch5(a,b) (strcmp(a,b) == 0)
#define strmatch6(a,b) (strcmp(a,b) == 0)
#define strmatch7(a,b) (strcmp(a,b) == 0)
#define strmatch8(a,b) (strcmp(a,b) == 0)
#else
#define strmatch2(a,b) ((a)[0] == (b)[0] && (a)[1] == (b)[1] && (a)[2] == '\0')
#define strmatch3(a,b) ((a)[0] == (b)[0] && (a)[1] == (b)[1] && (a)[2] == (b)[2] && (a)[3] == '\0')
#define strmatch4(a,b) ((a)[0] == (b)[0] && (a)[1] == (b)[1] && (a)[2] == (b)[2] && (a)[3] == (b)[3] && (a)[4] == '\0')
#define strmatch5(a,b) ((a)[0] == (b)[0] && (a)[1] == (b)[1] && (a)[2] == (b)[2] && (a)[3] == (b)[3] && (a)[4] == (b)[4] && (a)[5] == '\0')
#define strmatch6(a,b) ((a)[0] == (b)[0] && (a)[1] == (b)[1] && (a)[2] == (b)[2] && (a)[3] == (b)[3] && (a)[4] == (b)[4] && (a)[5] == (b)[5] && (a)[6] == '\0')
#define strmatch7(a,b) ((a)[0] == (b)[0] && (a)[1] == (b)[1] && (a)[2] == (b)[2] && (a)[3] == (b)[3] && (a)[4] == (b)[4] && (a)[5] == (b)[5] && (a)[6] == (b)[6] && (a)[7] == '\0')
#define strmatch8(a,b) ((a)[0] == (b)[0] && (a)[1] == (b)[1] && (a)[2] == (b)[2] && (a)[3] == (b)[3] && (a)[4] == (b)[4] && (a)[5] == (b)[5] && (a)[6] == (b)[6] && (a)[7] == (b)[7] && (a)[8] == '\0')
#endif
85
/* Uncomment to compile in the IPv6 trace messages guarded by UFDB_DEBUG_IPV6. */
/* #define UFDB_DEBUG_IPV6 */

/* Compile-time switch for the DEBUG() macro: 1 enables it, 0 compiles it away. */
/* #define UFDB_DO_DEBUG 1 */
#define UFDB_DO_DEBUG 0

/*
 * DEBUG(x) expands to "fprintf x" when debugging is enabled, so x must be a
 * complete parenthesised fprintf argument list, e.g. DEBUG(( stderr, "n=%d\n", n )).
 * When debugging is disabled the macro expands to nothing.
 */
#if UFDB_DO_DEBUG || 0
#define DEBUG(x) fprintf x
#else
#define DEBUG(x)
#endif
96
/* Forward declaration; UFDB_UUE_REFERER_LENGTH below takes the size of a pointer to it. */
struct unknownURLentry;

/* Sizes of the character buffers inside struct unknownURLentry. */
#define UFDB_UUE_DOMAIN_LENGTH (128)
#define UFDB_UUE_PATH_LENGTH (128)
/* 128 bytes minus the size of one pointer and one long, i.e. minus the sizes
 * of the 'next' and 'nhits' members of the struct below.
 */
#define UFDB_UUE_REFERER_LENGTH (128 - sizeof(struct unknownURLentry *) - sizeof(long))

/* One registered URL; entries with the same hash value are chained through 'next'. */
struct unknownURLentry
{
   struct unknownURLentry * next;      /* next entry in the same hash chain, or NULL */
   long nhits;                         /* number of times this domain was registered */
   char domain[UFDB_UUE_DOMAIN_LENGTH];     /* domain name, optionally followed by ":port" */
   char firstpath[UFDB_UUE_PATH_LENGTH];    /* a URL path that was seen with this domain */
   char referer[UFDB_UUE_REFERER_LENGTH];   /* referer seen with this domain; may be empty */
};
111
112
113
114 UFDB_GCC_HOT
/* Return a pointer to the first '/', '?', '&', ';' or '#' in url,
 * or to the terminating '\0' when url contains none of them.
 */
inline static char * findDomainEnd( char * url )
{
   size_t domainLength;

   domainLength = strcspn( url, "/?&;#" );
   return &url[domainLength];
}
119
120
121 UFDB_GCC_HOT
/* Find the first occurrence of chr in the byte range [str, maxstr).
 * Returns a pointer to it, or NULL when chr does not occur in that range.
 */
inline static char * strchr_before( char * str, char * maxstr, char chr )
{
   size_t span;
   void * hit;

   span = maxstr - str;
   hit = memchr( str, chr, span );
   return (char *) hit;
}
126
127
128 UFDB_GCC_HOT
/* Locate the "://" that terminates a protocol prefix.
 * The first character out of ".:/?@#%" must be the ':' of "://".
 * Returns a pointer to that ':' or NULL when URL has no protocol prefix.
 */
inline static char * findProtocolEnd( char * URL )
{
   int    offset;
   char * candidate;

   offset = strcspn( URL, ".:/?@#%" );
   candidate = URL + offset;

   if (candidate[0] != ':')
      return NULL;
   if (candidate[1] != '/'  ||  candidate[2] != '/')
      return NULL;

   return candidate;
}
138
139
/* Application initialisation hook; there is currently nothing to initialise. */
void UFDBappInit( void )
{
   /* intentionally empty */
}
144
145
146 #if UFDB_BARE_METAL_SUPPORT && __OCTEON__
/* OCTEON bare metal: CPU time measurement is not implemented, so t is left untouched. */
void UFDBtimerInit( struct tms * t )
{
   (void) t;
}
150
/* OCTEON bare metal: CPU time measurement is not implemented, so t is left untouched. */
void UFDBtimerStop( struct tms * t )
{
   (void) t;
}
154
/* OCTEON bare metal: write a fixed "not implemented" notice into line.
 * t and tag are ignored.
 */
void UFDBtimerPrintString( char * line, struct tms * t, const char * tag )
{
   static const char notice[] = "CPU time measurement not implemented on OCTEON";

   (void) t;
   (void) tag;
   strcpy( line, notice );
}
159
UFDBtimerPrint(struct tms * t,const char * tag)160 void UFDBtimerPrint( struct tms * t, const char * tag )
161 {
162 if (ufdbGV.debug)
163 ufdbLogMessage( "UFDBtimerPrint not supported on OCTEON" );
164 }
165
166 #else
167
/* Start a CPU time measurement: store the current process times in t.
 * The return value of times() is ignored.
 */
void UFDBtimerInit( struct tms * t )
{
   clock_t ignored;

   ignored = times( t );
   (void) ignored;
}
172
173
/* Stop a CPU time measurement.
 * On entry t holds the times stored by UFDBtimerInit(); on return every
 * member of t holds the difference between the current value and that
 * start value.
 */
void UFDBtimerStop( struct tms * t )
{
   struct tms now;

   (void) times( &now );

   t->tms_cstime = now.tms_cstime - t->tms_cstime;
   t->tms_cutime = now.tms_cutime - t->tms_cutime;
   t->tms_stime  = now.tms_stime  - t->tms_stime;
   t->tms_utime  = now.tms_utime  - t->tms_utime;
}
185
186
187 UFDB_SHARED static double numTicks = 0.0;
188
UFDBtimerPrintString(char * line,struct tms * t,const char * tag)189 void UFDBtimerPrintString( char * line, struct tms * t, const char * tag )
190 {
191 if (numTicks == 0.0)
192 numTicks = (double) sysconf( _SC_CLK_TCK );
193
194 if (tag == NULL)
195 tag = "UFDB timer";
196
197 sprintf( line, "%s: %5.2f user %5.2f sys %5.2f total",
198 tag,
199 (double) t->tms_utime / numTicks,
200 (double) t->tms_stime / numTicks,
201 (double) (t->tms_utime+t->tms_stime) / numTicks );
202 }
203
204
/* Format the CPU times in t with UFDBtimerPrintString() and write the
 * resulting text to the log.
 */
void UFDBtimerPrint( struct tms * t, const char * tag )
{
   char text[256];

   UFDBtimerPrintString( text, t, tag );
   ufdbLogMessage( "%s", text );
}
212 #endif
213
214
/* Portable gettimeofday().
 *
 * On OCTEON bare metal the time is derived from the CVMX_MIO_PTP_CLOCK_HI
 * register, which this code treats as a nanosecond counter; tz is ignored
 * and 0 is returned.  Elsewhere this is a thin wrapper around gettimeofday()
 * and its return value is passed on.
 */
int UFDBgettimeofday( struct timeval * tv, struct timezone * tz )
{
#if UFDB_BARE_METAL_SUPPORT && __OCTEON__
   const uint64_t nsPerSecond = 1000UL * 1000 * 1000;
   uint64_t       ns = cvmx_read_csr( CVMX_MIO_PTP_CLOCK_HI );

   (void) tz;
   if (ufdbGV.debug > 1)
      ufdbLogMessage( "UFDBgettimeofday: CVMX_MIO_PTP_CLOCK_HI is %llu", (unsigned long long) ns );
   tv->tv_sec = ns / nsPerSecond;
   /* tv_usec holds microseconds: convert the sub-second nanoseconds */
   tv->tv_usec = (ns % nsPerSecond) / 1000;
   return 0;
#else
   return gettimeofday( tv, tz );
#endif
}
228
229
/* Portable time(): return the current time in whole seconds.
 * On OCTEON bare metal the value is derived from the CVMX_MIO_PTP_CLOCK_HI
 * register, which is divided by 10^9 here; elsewhere time() is used.
 */
time_t UFDBtime( void )
{
#if UFDB_BARE_METAL_SUPPORT && __OCTEON__
   uint64_t nanoseconds;

   nanoseconds = cvmx_read_csr( CVMX_MIO_PTP_CLOCK_HI );
   if (ufdbGV.debug > 1)
      ufdbLogMessage( "UFDBtime: CVMX_MIO_PTP_CLOCK_HI is %ld", nanoseconds );
   return nanoseconds / (1000UL * 1000 * 1000);
#else
   time_t now;

   (void) time( &now );
   return now;
#endif
}
242
243
/* Return 1 when a real-time clock is available and 0 otherwise.
 * Only OCTEON bare metal asks cvmx_rtc_supported(); everywhere else the
 * answer is always 1.
 */
int UFDBhasRTC( void )
{
   int hasClock = 1;

#if UFDB_BARE_METAL_SUPPORT && __OCTEON__
   hasClock = (cvmx_rtc_supported() != 0);
#endif

   return hasClock;
}
252
253
/*
 * Rewrite an obfuscated numeric host name in decimal notation.
 *
 * Every dot-separated part of domain that starts with '0' is parsed as a
 * hexadecimal ("0x..") or octal ("0..") number and rewritten as a decimal
 * number; all other characters are copied unchanged.
 *
 * domain - the host name; rewritten in place.
 * URL    - a string that starts with the same host name, followed by the
 *          rest of the URL; the host part is replaced and the remainder
 *          is moved so that it directly follows the new host name.
 *
 * When the rewritten name does not fit in the internal 256-byte work
 * buffer, domain and URL are both left unchanged.
 */
static void removeObfuscations( char * domain, char * URL )
{
   char         newDomain[256];
   char         decimalBuf[16];
   char * const dLimit = newDomain + sizeof(newDomain) - 1;   /* room for the final '\0' */
   char *       s;
   char *       d;
   size_t       oldDomainLength;
   size_t       newDomainLength;
   int          octetStart;

   /* first put the normalised domain in newDomain */
   s = domain;
   d = newDomain;
   octetStart = 1;
   while (*s != '\0')
   {
      if (*s == '0' && octetStart)
      {
         unsigned int n = 0;      /* unsigned: very long numbers wrap instead of overflowing */
         char *       db;

         s++;
         if (*s == 'x')                                   /* hexadecimal */
         {
            s++;
            while (isxdigit( (unsigned char) *s ))
            {
               unsigned int digit;

               if (*s >= '0' && *s <= '9')
                  digit = *s - '0';
               else if (*s >= 'a' && *s <= 'f')
                  digit = *s - 'a' + 10;
               else
                  digit = *s - 'A' + 10;
               n = n * 16 + digit;
               s++;
            }
         }
         else                                             /* octal, or a single '0' */
         {
            while (isdigit( (unsigned char) *s ))
            {
               n = n * 8 + (*s - '0');
               s++;
            }
         }

         /* write the number in decimal notation (from right to left) */
         db = &decimalBuf[ sizeof(decimalBuf) - 1 ];
         *db = '\0';
         do
         {
            db--;
            *db = (char) ('0' + n % 10);
            n /= 10;
         } while (n > 0);

         /* copy the decimal number to the newDomain */
         while (*db != '\0')
         {
            if (d >= dLimit)
               return;                  /* does not fit: leave domain and URL untouched */
            *d++ = *db++;
         }
      }
      else
      {
         if (d >= dLimit)
            return;                     /* does not fit: leave domain and URL untouched */
         octetStart = (*s == '.');
         *d++ = *s++;
      }
   }
   *d = '\0';

   oldDomainLength = (size_t) (s - domain);
   newDomainLength = (size_t) (d - newDomain);

   /* update the domain */
   memcpy( domain, newDomain, newDomainLength + 1 );

   /* update the URL: move the rest of the URL (including its '\0') when the
    * domain length changed, then copy in the new domain.
    */
   if (newDomainLength != oldDomainLength)
      memmove( URL + newDomainLength, URL + oldDomainLength, strlen( URL + oldDomainLength ) + 1 );
   memcpy( URL, newDomain, newDomainLength );
}
373
374
/* Mutex that is held while the unknown-URL table below is read or modified. */
UFDB_GCC_ALIGN_CL
UFDB_SHARED static ufdb_mutex mutex_url_history = ufdb_mutex_initializer;

/* uues:     storage for all entries; allocated on first use.
 * uue_list: text buffer returned by ufdbGetUnknownURLs(); freed by ufdbResetUnknownURLs().
 * n_uue:    index of the next free slot in uues.  It is -1 before the table is
 *           allocated and is set to UFDB_UUE_MAXURLS by ufdbGetUnknownURLs()
 *           to block new registrations until the table is reset.
 */
UFDB_GCC_ALIGN_CL
UFDB_SHARED static struct unknownURLentry * uues = NULL; // malloc-ed array with UFDB_UUE_MAXURLS structs
UFDB_SHARED static char * uue_list = NULL;
UFDB_SHARED static volatile int n_uue = -1;

/* Hash table of chains of entries that live in uues. */
UFDB_GCC_ALIGN_CL
UFDB_SHARED static struct unknownURLentry * uueHash[UFDB_UUE_HASHSIZE]; // TODO: also malloc ?

/* Size of the uue_list buffer: a per-entry byte budget times the maximum
 * number of entries.  The budget appears to mirror the "%s%s#%ld#%s|" format
 * used by ufdbGetUnknownURLs(): domain, path, '#', 7 characters for the hit
 * counter, '#', referer and '|' - NOTE(review): confirm that 7 digits is a
 * sufficient allowance for nhits.
 */
#define UUE_LIST_SIZE (UFDB_UUE_MAXURLS * \
   (UFDB_UUE_DOMAIN_LENGTH + UFDB_UUE_PATH_LENGTH + 1 + 7 + 1 + \
   UFDB_UUE_REFERER_LENGTH + 1))
389
390
391 /* Store the domainname of a URL in the uue hashtable.
392 * If it already exists, increment the nhits counter.
393 */
ufdbRegisterUnknownURL(char * webserver,int portnumber,char * referer)394 void ufdbRegisterUnknownURL(
395 char * webserver,
396 int portnumber,
397 char * referer )
398 {
399 int length;
400 unsigned int hv;
401 char * path;
402 struct unknownURLentry * he;
403 struct unknownURLentry ** prev;
404 char domain[UFDB_UUE_DOMAIN_LENGTH];
405
406 if (ufdbGV.reconfig != UFDB_RECONFIGR_NONE || n_uue >= UFDB_UUE_MAXURLS)
407 return;
408
409 /* webserver names that do not have a dot (.) have no domain name */
410 /* and can never be incorporated in the URL database and are skipped here. */
411 if (webserver == NULL)
412 return;
413
414 path = findDomainEnd( webserver );
415 length = path - webserver;
416 if (length == 0 || strchr( webserver, '.' ) == NULL)
417 return;
418
419 if (ufdbGV.debug > 1)
420 ufdbLogMessage( "ufdbRegisterUnknownURL: %s %d %s", webserver, portnumber, path );
421
422 if (length < UFDB_UUE_DOMAIN_LENGTH - 1)
423 {
424 strncpy( domain, webserver, length );
425 domain[length] = '\0';
426 if (portnumber != 80 && length < UFDB_UUE_DOMAIN_LENGTH - 6)
427 sprintf( &domain[length], ":%d", portnumber );
428 }
429 else
430 {
431 char * orig;
432
433 /* the domainname is huge :-( */
434 /* reduce the length of the domainname by stripping as many subdomains as required */
435 orig = webserver;
436 webserver += (length - (UFDB_UUE_DOMAIN_LENGTH - 2 - 2));
437 while (*webserver != '\0' && *webserver != '.')
438 webserver++;
439 if (*webserver == '\0')
440 {
441 ufdbLogError( "ufdbRegisterUnknownURL: cannot store very long domainname \"%s\"", orig );
442 return;
443 }
444 webserver++;
445 path = findDomainEnd( webserver );
446 length = path - webserver;
447 domain[0] = '*';
448 domain[1] = '.';
449 strncpy( domain+2, webserver, length );
450 domain[length+2] = '\0';
451 if (length == 0 || strchr( domain+2, '.' ) == NULL)
452 return;
453 if (portnumber != 80 && length < UFDB_UUE_DOMAIN_LENGTH - 6 - 2 - 1)
454 sprintf( &domain[length], ":%d", portnumber );
455 }
456
457 /* calculate hash */
458 hv = 100003;
459 {
460 char * t;
461 for (t = domain; *t != '\0'; t++)
462 hv = (hv ^ (hv << 11)) ^ (*t * 17);
463 }
464 hv = hv % UFDB_UUE_HASHSIZE;
465
466 ufdb_mutex_lock( &mutex_url_history ); /* ======================================= */
467
468 if (uues == NULL)
469 {
470 uues = (struct unknownURLentry*) ufdbCalloc( sizeof(struct unknownURLentry), UFDB_UUE_MAXURLS );
471 n_uue = 0;
472 }
473
474 /* check again for full table */
475 if (n_uue >= UFDB_UUE_MAXURLS)
476 {
477 ufdb_mutex_unlock( &mutex_url_history ); /* ============= */
478 return;
479 }
480
481 /* find domain in uue table */
482 prev = &uueHash[hv];
483 he = uueHash[hv];
484 while (he != NULL)
485 {
486 if (strcmp( domain, he->domain ) == 0)
487 {
488 /* found in hashtable */
489 /* The URL path is saved in the hashtable, but the first one is never interesting,
490 * so overwrite it with number 2 and 3.
491 */
492 if (he->nhits < 3)
493 {
494 ufdbStrncpy( he->firstpath, path, sizeof(he->firstpath) );
495 if (referer != NULL)
496 ufdbStrncpy( he->referer, referer, sizeof(he->referer) );
497 }
498 he->nhits++;
499 ufdb_mutex_unlock( &mutex_url_history ); /* ============= */
500 return;
501 }
502 prev = &(he->next);
503 he = he->next;
504 }
505
506 /* domain was not found in hashtable; prev points to pointer to assign new entry */
507 he = &uues[n_uue];
508 n_uue++;
509 *prev = he;
510 he->next = NULL;
511 he->nhits = 1;
512 strcpy( he->domain, domain );
513 strcpy( he->firstpath, path );
514 if (referer == NULL)
515 he->referer[0] = '\0';
516 else
517 ufdbStrncpy( he->referer, referer, sizeof(he->referer) );
518
519 ufdb_mutex_unlock( &mutex_url_history ); /* ======================================= */
520
521 return;
522 }
523
524
525 /*
526 * Retrieve all registered uncategorised URLs.
527 * This must be followed by a ufdbResetUnknownURLs().
528 */
ufdbGetUnknownURLs(void)529 char * ufdbGetUnknownURLs( void )
530 {
531 int i;
532 char * tail;
533 struct unknownURLentry * he;
534
535 ufdb_mutex_lock( &mutex_url_history ); // >=============================================
536
537 n_uue = UFDB_UUE_MAXURLS; // prevent additions until call to ufdbResetUnknownURLs()
538
539 if (uue_list == NULL)
540 uue_list = (char*) ufdbMalloc( UUE_LIST_SIZE );
541 tail = uue_list;
542
543 *tail++ = '|';
544 *tail++ = 'N'; // "N1" signals the receiver that the list has "<url> # <nhits>"
545 *tail++ = '1';
546 *tail++ = '|';
547 for (i = 0; i < UFDB_UUE_HASHSIZE; i++)
548 {
549 for (he = uueHash[i]; he != NULL; he = he->next)
550 {
551 tail += sprintf( tail, "%s%s#%ld#%s|", he->domain, he->firstpath, he->nhits, he->referer );
552 if (tail > uue_list + UUE_LIST_SIZE - sizeof(struct unknownURLentry))
553 goto too_many;
554 }
555 }
556 too_many:
557 *tail = '\0';
558
559 ufdb_mutex_unlock( &mutex_url_history ); // <=============================================
560
561 return uue_list;
562 }
563
564
ufdbResetUnknownURLs(void)565 void ufdbResetUnknownURLs( void )
566 {
567 int i;
568
569 ufdb_mutex_lock( &mutex_url_history ); // >=======================================
570
571 if (uues == NULL)
572 {
573 uues = (struct unknownURLentry*) ufdbCalloc( sizeof(struct unknownURLentry), UFDB_UUE_MAXURLS );
574 }
575 n_uue = 0;
576
577 if (uue_list != NULL)
578 {
579 ufdbFree( uue_list );
580 uue_list = NULL;
581 }
582
583 for (i = 0; i < UFDB_UUE_HASHSIZE; i++)
584 uueHash[i] = NULL;
585
586 ufdb_mutex_unlock( &mutex_url_history ); // <=======================================
587 }
588
589
590 #if !UFDB_BARE_METAL_SUPPORT
/*
 * Open a TCP connection to serverName:port.
 *
 * The name is resolved with getaddrinfo() (IPv4 only unless
 * ufdbGV.useAlsoIPv6onWan is set) and the returned addresses are tried in
 * order until one connects.  At most 6 tries are made and a timed out try
 * counts double, so there are at most 3 timeouts of 8 seconds each.
 *
 * Returns a connected socket in blocking mode with 20 second send and
 * receive timeouts, or -1 on failure.  When the host name cannot be resolved
 * errno is set to EAGAIN (where defined).
 */
int UFDBopenSocket( const char * serverName, int port )
{
   int               s;
   int               ret;
   int               savedErrno;
   int               sock_parm;
   int               numTries;
   long              oldflags;
   time_t            t0, t;
   char              servicePort[16];          /* large enough for any int */
   struct addrinfo * addrlist;
   struct addrinfo * al;
   struct addrinfo   addrinfo_hints;
   struct timeval    tv;

   addrlist = NULL;
   snprintf( servicePort, sizeof(servicePort), "%d", port );

   /* zero all members, including any that are specific to the platform */
   memset( &addrinfo_hints, 0, sizeof(addrinfo_hints) );
   addrinfo_hints.ai_flags = AI_NUMERICSERV;
#ifdef AI_IDN
   addrinfo_hints.ai_flags |= AI_IDN;
#endif
#ifdef AI_ADDRCONFIG
   addrinfo_hints.ai_flags |= AI_ADDRCONFIG;
#endif
   /* use AF_UNSPEC for IPv4 or IPv6; AF_INET for IPv4 only */
   addrinfo_hints.ai_family = ufdbGV.useAlsoIPv6onWan ? AF_UNSPEC : AF_INET;
   addrinfo_hints.ai_socktype = SOCK_STREAM;
   addrinfo_hints.ai_protocol = IPPROTO_TCP;

   t0 = time( NULL );
   ret = getaddrinfo( serverName, servicePort, &addrinfo_hints, &addrlist );
   if (ret != 0)
   {
      savedErrno = errno;                       /* save it before any other library call */
      t = time( NULL );
      if (t - t0 >= 4)
         ufdbLogMessage( "UFDBopenSocket: cannot resolve hostname %s: %s - %s. "
                         "getaddrinfo took %ld seconds to return. Is the DNS server OK? *****",
                         serverName, gai_strerror(ret),
                         ret == EAI_SYSTEM ? strerror(savedErrno) : "errno is not set",
                         (long) (t - t0) );
      else
      {
         ufdbLogMessage( "UFDBopenSocket: cannot resolve hostname '%s': %s - %s",
                         serverName, gai_strerror(ret),
                         ret == EAI_SYSTEM ? strerror(savedErrno) : "errno is not set" );
      }
#ifdef EAGAIN
      errno = EAGAIN;
#endif
      return -1;
   }

   numTries = 0;
   s = -1;
   /* try to connect to many returned addresses until one succeeds */
   for (al = addrlist;  al != NULL && numTries < 6;  al = al->ai_next)
   {
      char addrbuf[INET6_ADDRSTRLEN+1];
      int  connected;

      numTries++;
      if (al->ai_family == AF_INET)
      {
         if (inet_ntop( al->ai_family, &((struct sockaddr_in *) al->ai_addr)->sin_addr, addrbuf,
                        sizeof(addrbuf) ) == NULL)
            strcpy( addrbuf, "unknown" );
      }
      else
      {
         if (inet_ntop( al->ai_family, &((struct sockaddr_in6 *) al->ai_addr)->sin6_addr, addrbuf,
                        sizeof(addrbuf) ) == NULL)
            strcpy( addrbuf, "unknown" );
      }
      if (ufdbGV.debug > 1  ||  ufdbGV.peek)
      {
         ufdbLogMessage( "UFDBopenSocket: '%s' resolved to '%s', port %d try #%d",
                         serverName, addrbuf, port, numTries );
      }

      s = socket( al->ai_family, SOCK_STREAM, 0 );
      if (s < 0)
      {
         if (ufdbGV.debug > 1  ||  ufdbGV.peek)
         {
            ufdbLogMessage( "UFDBopenSocket: could not open socket to '%s' (%s) port %d: %s - %s",
                            serverName, addrbuf, port, strerror(errno),
                            al->ai_next == NULL ? "no other IP addresses to try" : "trying next IP address" );
         }
         continue;      /* we may not be able to make an IPv6 socket, so continue trying other sockets */
      }

      /*
       * Prevent that the connect takes ages.  Use an aggressive timeout of 8 seconds.
       */
      tv.tv_sec = 8;
      tv.tv_usec = 0;
      setsockopt( s, SOL_SOCKET, SO_RCVTIMEO, (void *) &tv, sizeof(tv) );
      tv.tv_sec = 8;
      tv.tv_usec = 0;
      setsockopt( s, SOL_SOCKET, SO_SNDTIMEO, (void *) &tv, sizeof(tv) );

      sock_parm = 200 * 1024;
      setsockopt( s, SOL_SOCKET, SO_SNDBUF, (void *) &sock_parm, sizeof(sock_parm) );

      /* The RCVTIMEO and SNDTIMEO do not work on all OSes (e.g. Solaris) so
       * we will set the socket in non-blocking mode, use poll with a timeout and
       * check the socket status for successful connection.
       * After this, the socket goes back to blocking mode.
       */
      oldflags = fcntl( s, F_GETFL, NULL );
      if (oldflags < 0)
         oldflags = 0;                          /* ignore errors */
      (void) fcntl( s, F_SETFL, oldflags|O_NONBLOCK );

      /* with anti-aliasing warnings ON, connect/bind cause compiler warnings which we may ignore */
      connected = 0;
      if (connect( s, al->ai_addr, al->ai_addrlen ) == 0)
      {
         /* A non-blocking connect may succeed immediately (e.g. to a local
          * address).  This socket is the result: do not try other addresses.
          */
         connected = 1;
      }
      else if (errno == EINPROGRESS)
      {
         struct pollfd pfd;
         int           rv;

         pfd.fd = s;
         pfd.events = POLLIN|POLLOUT|POLLPRI|POLLHUP;
         pfd.revents = 0;
         rv = poll( &pfd, 1, 8000 );
         if (ufdbGV.debug > 1)
            ufdbLogMessage( "poll() returned %d", rv );
         if (rv == 0)
         {
            /* timed out */
            errno = ETIMEDOUT;
            numTries++;                         // increment numTries again to have max. 3 timeouts
         }
         else if (rv > 0)
         {
            /* poll() signalled that there is I/O. check socket error status */
            int       errorStatus;
            socklen_t len;

            len = sizeof(errorStatus);
            if (getsockopt( s, SOL_SOCKET, SO_ERROR, (void *) &errorStatus, &len ) == 0)
            {
               if (errorStatus == 0)
                  connected = 1;
               else if (errno == 0)
                  errno = ECONNREFUSED;
            }
            /* else: error; errno was set by getsockopt() */
         }
         /* else: error; errno was set by poll() */
      }

      if (connected)
      {
         /* go back to blocking mode */
         (void) fcntl( s, F_SETFL, oldflags );
         break;
      }

      if (errno == EINPROGRESS)
         errno = EAGAIN;
      ufdbLogError( "UFDBopenSocket: cannot connect to %s/%s port %d: %s - %s",
                    serverName, addrbuf, port, strerror(errno),
                    al->ai_next == NULL ? "no other addresses to try" : "trying next IP address" );
      close( s );
      s = -1;
   }
   freeaddrinfo( addrlist );

   if (s >= 0)
   {
      /*
       * Prevent long blocking on communication with the other side.
       */
      tv.tv_sec = 20;
      tv.tv_usec = 0;
      setsockopt( s, SOL_SOCKET, SO_RCVTIMEO, (void *) &tv, sizeof(tv) );

      tv.tv_sec = 20;
      tv.tv_usec = 0;
      setsockopt( s, SOL_SOCKET, SO_SNDTIMEO, (void *) &tv, sizeof(tv) );
   }

   return s;
}
794 #endif
795
796
797 #if UFDB_BARE_METAL_SUPPORT
// On bare metal we do not have arpa/inet.h nor inet_pton() and inet_ntop()
// so to normalise an IPv6 address we simply only downcase and remove leading zeroes.
// A field that consists of zeroes only is written as a single '0'.
// The result is written to normalised, which is never longer than address.
// Always returns 1 (successful): the address is not validated.
inline static int UFDBnormaliseIPv6(
   const char * address,
   char *       normalised )
{
   const char * in = address;
   char *       out = normalised;

   while (*in != '\0')
   {
      // start of a field: drop the leading zeroes
      if (*in == '0')
      {
         do
         {
            in++;
         } while (*in == '0');
         if (*in == '\0'  ||  *in == ':')
            *out++ = '0';
      }

      // rest of the field, in lower case
      for ( ;  *in != '\0' && *in != ':';  in++)
         *out++ = tolower( (int) *in );

      // field separator
      if (*in == ':')
      {
         *out++ = ':';
         in++;
      }
   }
   *out = '\0';

   return 1;   /* successful */
}
827
828 #else
829
/* Normalise the textual IPv6 address in address: it is converted to binary
 * with inet_pton() and back to text with inet_ntop().
 * normalised must have room for INET6_ADDRSTRLEN bytes.
 * Returns 1 on success.  Returns 0 when address is not accepted by
 * inet_pton(); normalised is then an empty string.
 */
inline static int UFDBnormaliseIPv6(
   const char * address,
   char *       normalised )
{
   struct in6_addr binary;
   int             ok;

   normalised[0] = '\0';

   ok = inet_pton( AF_INET6, address, (void *) &binary ) > 0  &&
        inet_ntop( AF_INET6, (void *) &binary, normalised, INET6_ADDRSTRLEN ) != NULL;

#ifdef UFDB_DEBUG_IPV6
   if (ok)
      ufdbLogMessage( " UFDBnormaliseIPv6 %s -> %s", address, normalised );
#endif

   return ok ? 1 : 0;
}
850 #endif
851
852
/* Fallback buffer size for a textual IPv6 address, used when no included
 * system header has defined INET6_ADDRSTRLEN.
 */
#ifndef INET6_ADDRSTRLEN
#define INET6_ADDRSTRLEN 48
#endif
856
UFDBparseIPv6address(char * url,char * domain)857 char * UFDBparseIPv6address(
858 char * url,
859 char * domain )
860 {
861 char * url_start;
862 char * d;
863 char normalisedAddress[INET6_ADDRSTRLEN];
864
865 #ifdef UFDB_DEBUG_IPV6
866 ufdbLogMessage( " UFDBparseIPv6address: url: %s", url );
867 #endif
868
869 url_start = url;
870 *domain = '\0';
871 d = domain;
872 if (*url == '[')
873 {
874 url++;
875 }
876
877 while (*url != '\0')
878 {
879 if (*url == ']')
880 {
881 *d++ = ']';
882 *d = '\0';
883 if (UFDBnormaliseIPv6( domain, normalisedAddress ))
884 strcpy( domain, normalisedAddress );
885 else
886 {
887 if (ufdbGV.debug)
888 ufdbLogMessage( "URL has invalid IPv6 address: %s", *url_start=='[' ? url_start+1 : url_start );
889 return NULL;
890 }
891 /* TODO: handle IPv4 in IPv6 addresses e.g. ::127.0.0.1 */
892 return url;
893 }
894
895 if (*url == ':' || *url == '.' || isxdigit( (int) *url))
896 {
897 *d++ = *url++;
898 }
899 else /* URL address error */
900 {
901 *d = '\0';
902 if (ufdbGV.debug)
903 ufdbLogMessage( "URL has invalid IPv6 address: %s", *url_start=='[' ? url_start+1 : url_start );
904 return NULL;
905 }
906 }
907 *d = '\0';
908
909 #ifdef UFDB_DEBUG_IPV6
910 ufdbLogMessage( " UFDBparseIPv6address: domain: %s", domain );
911 #endif
912
913 if (UFDBnormaliseIPv6( domain, normalisedAddress ))
914 {
915 #ifdef UFDB_DEBUG_IPV6
916 ufdbLogMessage( " IPv6 domain '%s' normalised to '%s'", domain, normalisedAddress );
917 #endif
918 strcpy( domain, normalisedAddress );
919 }
920 else
921 {
922 if (ufdbGV.debug)
923 ufdbLogMessage( "URL has invalid IPv6 address: %s", *url_start=='[' ? url_start+1 : url_start );
924 return NULL; /* address error */
925 }
926
927 /* TODO: handle IPv4 in IPv6 addresses e.g. ::127.0.0.1 */
928 return url;
929 }
930
931
/*
 * Replace the bracketed host in url by newDomain.
 *
 * url       - must start with '[' and contain a ']'; otherwise an error is
 *             logged and url is left unchanged.
 * newDomain - the text to put between the brackets.
 *
 * The text from ']' onwards is moved so that it directly follows newDomain.
 * url is modified in place and must have room for the result when newDomain
 * is longer than the text that it replaces.
 */
void UFDBupdateURLwithNormalisedDomain(
   char * url,
   char * newDomain )
{
   char * host;
   char * oldEnd;
   size_t oldLength;
   size_t newLength;

#ifdef UFDB_DEBUG_IPV6
   ufdbLogMessage( " UFDBupdateURLwithNormalisedDomain: %s", url );
#endif

   if (*url != '[')
   {
      ufdbLogError( "UFDBupdateURLwithNormalisedDomain: URL does not start with '[': %s", url );
      return;
   }
   host = url + 1;

   oldEnd = strchr( host, ']' );
   if (oldEnd == NULL)
   {
      ufdbLogError( "UFDBupdateURLwithNormalisedDomain: URL does not have a ']': %s", host );
      return;
   }

   oldLength = (size_t) (oldEnd - host);
   newLength = strlen( newDomain );

   /* first move "]..." (including the '\0') to its new position, then copy in the new host */
   if (newLength != oldLength)
      memmove( host + newLength, oldEnd, strlen( oldEnd ) + 1 );
   memcpy( host, newDomain, newLength );

#ifdef UFDB_DEBUG_IPV6
   if (newLength < oldLength)
      ufdbLogMessage( " UFDBupdateURLwithNormalisedDomain: %s", url );
#endif
}
991
992
UFDBnormaliseIPv4(char * domain)993 void UFDBnormaliseIPv4( char * domain )
994 {
995 char * d;
996 char * orig;
997 unsigned int octetvalue;
998 char dbuf[512];
999
1000 orig = domain;
1001 d = dbuf;
1002 while (*domain != '\0')
1003 {
1004 if (*domain == '0')
1005 {
1006 domain++;
1007 octetvalue = 0;
1008 if (*domain == 'x') /* obfuscated hexadecimal octet */
1009 {
1010 domain++;
1011 while (isxdigit((int) *domain))
1012 {
1013 octetvalue *= 16;
1014 if (*domain >= '0' && *domain <= '9')
1015 octetvalue += (*domain - '0');
1016 else
1017 octetvalue += (*domain - 'a' + 10);
1018 domain++;
1019 }
1020 if (*domain != '\0' && *domain != '.')
1021 {
1022 ufdbLogError( "IPv4 address has illegal hexadecimal octet: %s", orig );
1023 return;
1024 }
1025 }
1026 else if (*domain >= '0' && *domain <= '7') /* obfuscated octal octet */
1027 {
1028 while (*domain >= '0' && *domain <= '7')
1029 {
1030 octetvalue *= 8;
1031 octetvalue += (*domain - '0');
1032 domain++;
1033 }
1034 if (*domain != '\0' && *domain != '.')
1035 {
1036 ufdbLogError( "IPv4 address has illegal octal octet: %s", orig );
1037 return;
1038 }
1039 }
1040 else
1041 {
1042 ufdbLogError( "IPv4 address has illegal octet: %s", orig );
1043 return;
1044 }
1045 if (octetvalue > 255)
1046 {
1047 ufdbLogError( "obfuscated IPv4 address has illegal octet value: %s", orig );
1048 return;
1049 }
1050 /* convert the octetvalue to a decimal string */
1051 d += sprintf( d, "%u", octetvalue );
1052 }
1053 else /* octet is not obfuscated */
1054 {
1055 while (*domain != '\0' && *domain != '.')
1056 *d++ = *domain++;
1057 }
1058
1059 if (*domain == '.')
1060 *d++ = *domain++;
1061 }
1062 *d = '\0';
1063
1064 if (ufdbGV.debug > 1)
1065 ufdbLogMessage( "obfuscated domain %s rewritten to %s", orig, dbuf );
1066
1067 strcpy( orig, dbuf );
1068 }
1069
1070
UFDBaddYoutubeEdufilter(char * domain,char * strippedURL,char * originalURL)1071 int UFDBaddYoutubeEdufilter(
1072 char * domain,
1073 char * strippedURL,
1074 char * originalURL )
1075 {
1076 char * dot;
1077
1078 if (strcmp( domain, "youtube.com" ) == 0)
1079 {
1080 char * id = ufdbGV.YoutubeEdufilterID;
1081 if (id == NULL)
1082 return UFDB_API_OK;
1083 #if 0
1084 ufdbLogMessage( " YouTube Edufilter: %s %s", domain, strippedURL );
1085 #endif
1086 dot = strrchr( strippedURL, '.' );
1087 if (dot == NULL)
1088 {
1089 if (strchr( strippedURL, '?' ) == NULL)
1090 strcat( originalURL, "?edufilter=" );
1091 else
1092 strcat( originalURL, "&edufilter=" );
1093 strcat( originalURL, id );
1094
1095 return UFDB_API_MODIFIED_FOR_YOUTUBE_EDUFILTER;
1096 }
1097 else
1098 {
1099 if (strcmp( dot+1, "css" ) != 0 &&
1100 strcmp( dot+1, "ico" ) != 0 &&
1101 strcmp( dot+1, "gif" ) != 0 &&
1102 strcmp( dot+1, "jpg" ) != 0 &&
1103 strcmp( dot+1, "png" ) != 0 &&
1104 strcmp( dot+1, "js" ) != 0 &&
1105 strcmp( dot+1, "xml" ) != 0)
1106 {
1107 if (strchr( dot, '?' ) == NULL)
1108 strcat( originalURL, "?edufilter=" );
1109 else
1110 strcat( originalURL, "&edufilter=" );
1111 strcat( originalURL, id );
1112
1113 return UFDB_API_MODIFIED_FOR_YOUTUBE_EDUFILTER;
1114 }
1115 }
1116 }
1117
1118 return UFDB_API_OK;
1119 }
1120
1121
1122 /*
1123 * UFDBaddSafeSearch - modify a URL for a search which requires SafeSearch
1124 *
1125 * return UFDB_API_OK for unmodified URLs and UFDB_API_MODIFIED_FOR_SAFESEARCH
1126 *
1127 * parameters: domain - the domainname
1128 * strippedURL - the stripped URL including the domainname
1129 * originalURL - the unmodified user-supplied URL
1130 * The originalURL must be of type char[UFDB_MAX_URL_LENGTH]
1131 * and may be modified to force SafeSearch.
1132 */
UFDBaddSafeSearch(char * domain,char * strippedURL,char * originalURL)1133 int UFDBaddSafeSearch(
1134 char * domain,
1135 char * strippedURL,
1136 char * originalURL )
1137 {
1138 char * slash;
1139
1140 originalURL[UFDB_MAX_URL_LENGTH-28] = '\0';
1141
1142 slash = strchr( strippedURL, '/' );
1143 if (slash == NULL)
1144 strippedURL = (char *) "";
1145 else
1146 strippedURL = slash;
1147
1148 #if 0
1149 ufdbLogMessage( " SS: %s %s", domain, strippedURL );
1150 #endif
1151
1152 if (strstr( domain, "similar-images.googlelabs." ) != NULL && /* Google images */
1153 strstr( strippedURL, "q=") != NULL)
1154 {
1155 strcat( originalURL, "&safe=active&safeui=on" );
1156 UFDB_API_num_safesearch++;
1157 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1158 }
1159 else
1160 if (strstr( domain, "images.google." ) != NULL && /* Google images */
1161 strstr( strippedURL, "q=") != NULL)
1162 {
1163 strcat( originalURL, "&safe=active&safeui=on" );
1164 UFDB_API_num_safesearch++;
1165 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1166 }
1167 else
1168 if (strstr( domain, "youtube.com" ) != NULL && /* Youtube */
1169 strstr( strippedURL, "search_query=") != NULL)
1170 {
1171 strcat( originalURL, "&safety_mode=true" ); /* unfortunately this does not work since */
1172 UFDB_API_num_safesearch++; /* also need to set Cookie: (.*) PREF=f2=8000000 */
1173 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1174 }
1175 else
1176 if ((domain[0] <= '9' && domain[0] >= '0') && /* google-related sites like www.google-tr.info */
1177 strstr( strippedURL, "cx=partner" ) != NULL &&
1178 strstr( strippedURL, "/cse" ) != NULL &&
1179 strstr( strippedURL, "q=" ) != NULL)
1180 {
1181 strcat( originalURL, "&safe=active" );
1182 UFDB_API_num_safesearch++;
1183 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1184 }
1185 else
1186 if ((domain[0] <= '9' && domain[0] >= '0') && /* google.com, google.de, google.ws etc. */
1187 strncmp( strippedURL, "/search", 7 ) == 0 &&
1188 strstr( strippedURL, "q=" ) != NULL &&
1189 (strncmp( domain, "74.125.", 7 ) == 0 ||
1190 strncmp( domain, "173.194.", 8 ) == 0))
1191 {
1192 strcat( originalURL, "&safe=active" );
1193 UFDB_API_num_safesearch++;
1194 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1195 }
1196 else
1197 if ((strstr( domain, "google." ) != NULL || /* SAFESEARCH: google.* */
1198 strcmp( domain, "ajax.googleapis.com" ) == 0 ||
1199 strstr( domain, "googleusercontent.com" ) != NULL) && /* Google */
1200 strstr( strippedURL, "q=" ) != NULL &&
1201 ((strncmp( strippedURL, "/insights", 9 ) != 0 && strstr( strippedURL, "/search" ) != NULL) ||
1202 strstr( strippedURL, "/uds/afs" ) != NULL ||
1203 strstr( strippedURL, "/uds/gwebsearch" ) != NULL ||
1204 strstr( strippedURL, "/uds/gvideosearch" ) != NULL ||
1205 strstr( strippedURL, "/uds/gimagesearch" ) != NULL ||
1206 strstr( strippedURL, "/uds/gblogsearch" ) != NULL ||
1207 strstr( strippedURL, "/videosearch" ) != NULL ||
1208 strstr( strippedURL, "/blogsearch" ) != NULL ||
1209 strstr( strippedURL, "/gwebsearch" ) != NULL ||
1210 strstr( strippedURL, "/groups" ) != NULL ||
1211 strstr( strippedURL, "/cse" ) != NULL ||
1212 strstr( strippedURL, "/products" ) != NULL ||
1213 strstr( strippedURL, "/images" ) != NULL ||
1214 strstr( strippedURL, "/custom" ) != NULL) )
1215 {
1216 char * safe;
1217 /* search for 'safe=off' and replace by 'safe=active' */
1218 safe = strstr( originalURL, "&safe=off" );
1219 if (safe != NULL)
1220 {
1221 safe += 6;
1222 *safe++ = 'a'; /* 'o' */
1223 *safe++ = 'c'; /* 'f' */
1224 *safe++ = 't'; /* 'f' */
1225 (void) memmove( safe+3, safe, strlen(safe)+1 );
1226 *safe++ = 'i';
1227 *safe++ = 'v';
1228 *safe = 'e';
1229 }
1230 strcat( originalURL, "&safe=active&safeui=on" );
1231 UFDB_API_num_safesearch++;
1232 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1233 }
1234 else
1235 if (strstr( domain, "webmenu.com" ) != NULL && /* SAFESEARCH: webmenu.com */
1236 (strstr( strippedURL, "q_or=") != NULL ||
1237 strstr( strippedURL, "q_and=") != NULL ||
1238 strstr( strippedURL, "ss=") != NULL ||
1239 strstr( strippedURL, "keyword=") != NULL ||
1240 strstr( strippedURL, "query=") != NULL) )
1241 {
1242 char * p;
1243 /* TODO: fix problem of cookie override; a user can set preferences to turn the filter OFF
1244 * in the user preferences.
1245 */
1246 while ((p = strstr( originalURL, "&ss=n" )) != NULL)
1247 *(p+4) = 'y';
1248 strcat( originalURL, "&ss=y" );
1249 UFDB_API_num_safesearch++;
1250 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1251 }
1252 else
1253 if (strstr( domain, "blekko.com" ) != NULL && /* SAFESEARCH: blekko.com */
1254 strncmp( strippedURL, "/ws/", 4 ) == 0)
1255 {
1256 if (strchr( strippedURL, '?' ) == NULL)
1257 strcat( originalURL, "?safesearch=2" );
1258 else
1259 strcat( originalURL, "&safesearch=2" );
1260 UFDB_API_num_safesearch++;
1261 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1262 }
1263 else
1264 if (strstr( domain, "izito." ) != NULL && /* SAFESEARCH: izito.* */
1265 (strstr( strippedURL, "query=" ) != NULL ||
1266 strstr( strippedURL, "q=" ) != NULL))
1267 {
1268 strcat( originalURL, "&ss=y" );
1269 UFDB_API_num_safesearch++;
1270 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1271 }
1272 else
1273 if (strstr( domain, "zapmeta." ) != NULL && /* SAFESEARCH: zapmeta.* */
1274 strstr( strippedURL, "vid=" ) != NULL &&
1275 strstr( strippedURL, "q=" ) != NULL)
1276 {
1277 strcat( originalURL, "&ss=y" );
1278 UFDB_API_num_safesearch++;
1279 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1280 }
1281 else
1282 if (strstr( domain, "bing.com" ) != NULL && /* SAFESEARCH: bing. */
1283 strstr( strippedURL, "q=" ) != NULL) /* bing */
1284 {
1285 strcat( originalURL, "&ADLT=STRICT&filt=all" );
1286 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1287 }
1288 else
1289 if (strstr( domain, "bing.co.uk" ) != NULL && /* SAFESEARCH: bing.co.uk */
1290 strstr( strippedURL, "q=" ) != NULL) /* bing */
1291 {
1292 strcat( originalURL, "&ADLT=STRICT&filt=all" );
1293 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1294 }
1295 else
1296 if (strcmp( domain, "api.bing.net" ) == 0 && /* Safesearch: bing API */
1297 strncmp( strippedURL, "/json.aspx", 10 ) == 0 && /* called by searchgby.com */
1298 strstr( strippedURL, "query=" ) != NULL)
1299 {
1300 strcat( originalURL, "&Adult=Strict" );
1301 UFDB_API_num_safesearch++;
1302 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1303 }
1304 else
1305 if (strcmp( domain, "search.searchcompletion.com" ) == 0 && /* SAFESEARCH: searchcompletion.com */
1306 strncmp( strippedURL, "/localsearchresults.aspx", 10 ) == 0 && /* search.searchcompletion.com/LocalSearchResults.aspx */
1307 strstr( strippedURL, "q=" ) != NULL)
1308 {
1309 strcat( originalURL, "&safe=on" ); /* TO-DO: fix this */
1310 UFDB_API_num_safesearch++;
1311 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1312 }
1313 else
1314 if (strstr( domain, "pageset.com" ) != NULL && /* pageset.com */
1315 strstr( strippedURL, "q=" ) != NULL)
1316 {
1317 char * t;
1318 t = strstr( strippedURL, "adt=1" );
1319 if (t != NULL)
1320 *(t+4) = '0';
1321 else
1322 strcat( originalURL, "&adt=0" );
1323 UFDB_API_num_safesearch++;
1324 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1325 }
1326 else
1327 if (strstr( domain, "trovator.com" ) != NULL && /* searchcompletion.com trovator.com */
1328 strstr( strippedURL, "q=" ) != NULL)
1329 {
1330 strcat( originalURL, "&fil=si" );
1331 UFDB_API_num_safesearch++;
1332 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1333 }
1334 else
1335 if (strcmp( domain, "results.searchlock.com" ) == 0 && /* SAFESEARCH: searchlock.com */
1336 strstr( strippedURL, "q=") != NULL)
1337 {
1338 strcat( originalURL, "&sf=1" );
1339 UFDB_API_num_safesearch++;
1340 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1341 }
1342 else
1343 if (strstr( domain, ".yauba.com" ) != NULL && /* SAFESEARCH: yauba.com */
1344 strstr( strippedURL, "query=") != NULL)
1345 {
1346 strcat( originalURL, "&ss=y" );
1347 UFDB_API_num_safesearch++;
1348 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1349 }
1350 else
1351 if (strstr( domain, "forestle.org" ) != NULL && /* SAFESEARCH: forestle.org */
1352 (strstr( strippedURL, "settings") != NULL ||
1353 strstr( strippedURL, "q=") != NULL))
1354 {
1355 strcat( originalURL, "&adultfilter=noadult" );
1356 UFDB_API_num_safesearch++;
1357 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1358 }
1359 else
1360 if (strstr( domain, "zombol.com" ) != NULL && /* SAFESEARCH: zombol.com */
1361 strstr( strippedURL, "/results") != NULL &&
1362 strstr( strippedURL, "q=") != NULL)
1363 {
1364 strcat( originalURL, "&safe=active" );
1365 UFDB_API_num_safesearch++;
1366 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1367 }
1368 else
1369 if (strstr( domain, "kalooga.com" ) != NULL && /* SAFESEARCH: kalooga.com */
1370 strstr( strippedURL, "search") != NULL &&
1371 strstr( strippedURL, "query=") != NULL)
1372 {
1373 strcat( originalURL, "&filter=default" );
1374 UFDB_API_num_safesearch++;
1375 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1376 }
1377 else
1378 if (strstr( domain, "muuler.com" ) != NULL && /* SAFESEARCH: muuler.com */
1379 strstr( strippedURL, "/result") != NULL &&
1380 strstr( strippedURL, "q=") != NULL)
1381 {
1382 strcat( originalURL, "&safe=active" );
1383 UFDB_API_num_safesearch++;
1384 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1385 }
1386 else
1387 if (strstr( domain, "foozir.com" ) != NULL && /* SAFESEARCH: foozir.com */
1388 strstr( strippedURL, "/result") != NULL &&
1389 strstr( strippedURL, "q=") != NULL)
1390 {
1391 strcat( originalURL, "&safe=active" );
1392 UFDB_API_num_safesearch++;
1393 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1394 }
1395 else
1396 if (strstr( domain, "moons.it" ) != NULL && /* SAFESEARCH: moons.it */
1397 strstr( strippedURL, "/ricerca") != NULL &&
1398 strstr( strippedURL, "q=") != NULL)
1399 {
1400 strcat( originalURL, "&safe=active" );
1401 UFDB_API_num_safesearch++;
1402 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1403 }
1404 else
1405 if (strstr( domain, "wotbox.com" ) != NULL && /* SAFESEARCH: wotbox.com */
1406 (strstr( strippedURL, "q=") != NULL ||
1407 strstr( strippedURL, "op0=") != NULL) )
1408 {
1409 strcat( originalURL, "&a=true" );
1410 UFDB_API_num_safesearch++;
1411 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1412 }
1413 else
1414 if (strstr( domain, "ant.com" ) != NULL && /* SAFESEARCH: ant.com */
1415 strstr( strippedURL, "antq=") != NULL)
1416 {
1417 strcat( originalURL, "&safe=1" );
1418 UFDB_API_num_safesearch++;
1419 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1420 }
1421 else
1422 if ((strstr( domain, "duck.co" ) != NULL || /* SAFESEARCH: duck.co */
1423 strstr( domain, "duckduckgo.org" ) != NULL || /* SAFESEARCH: duckduckgo.org */
1424 strstr( domain, "duckduckgo.com" ) != NULL) && /* SAFESEARCH: duckduckgo.com */
1425 strstr( strippedURL, "q=") != NULL)
1426 {
1427 strcat( originalURL, "&kp=1" );
1428 UFDB_API_num_safesearch++;
1429 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1430 }
1431 else
1432 if ((strstr( domain, "qbyrd.com" ) != NULL || /* SAFESEARCH: qbyrd.com */
1433 strstr( domain, "search-results.com" ) != NULL) && /* SAFESEARCH: search-results.com */
1434 strstr( strippedURL, "q=") != NULL)
1435 {
1436 char * adt;
1437 adt = strstr( originalURL, "adt=1" );
1438 if (adt != NULL)
1439 *(adt+4) = '0';
1440 else
1441 strcat( originalURL, "&adt=0" );
1442 UFDB_API_num_safesearch++;
1443 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1444 }
1445 else
1446 if (strstr( domain, "easysearch.org.uk" ) != NULL && /* SAFESEARCH: easysearch.org.uk */
1447 strstr( strippedURL, "search") != NULL)
1448 {
1449 strcat( originalURL, "&safe=on" );
1450 UFDB_API_num_safesearch++;
1451 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1452 }
1453 else
1454 if (strstr( domain, "ecosia.org" ) != NULL &&
1455 strstr( strippedURL, "q=" ) != NULL) /* SAFESEARCH: ecosia.org */
1456 {
1457 strcat( originalURL, "&safeSearch:1" );
1458 UFDB_API_num_safesearch++;
1459 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1460 }
1461 else
1462 if (strstr( domain, "ask.com" ) != NULL &&
1463 strchr( strippedURL, '?' ) != NULL) /* SAFESEARCH: ask.com */
1464 {
1465 strcat( originalURL, "&adt=0" );
1466 UFDB_API_num_safesearch++;
1467 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1468 }
1469 else
1470 if (strncmp( domain, "api.search.yahoo.", 17 ) == 0 && /* SAFESEARCH: API yahoo.* */
1471 strstr( strippedURL, "query=" ) != NULL)
1472 {
1473 strcat( originalURL, "&adult_ok=0" );
1474 UFDB_API_num_safesearch++;
1475 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1476 }
1477 else
1478 if ((strcmp( domain, "search.aol.com" ) == 0 ||
1479 strstr( domain, ".aolsearch.com" ) != NULL) && /* Safesearch: AOL */
1480 strncmp( strippedURL, "/search", 7 ) == 0 &&
1481 strstr( strippedURL, "q=" ) != NULL)
1482 {
1483 strcat( originalURL, "&safesearch=1&sp_ss=1" );
1484 UFDB_API_num_safesearch++;
1485 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1486 }
1487 else
1488 if ((strstr( domain, ".terra." ) != NULL && /* SAFESEARCH: terra.* */
1489 strstr( domain, "busca" ) != NULL) &&
1490 (strstr( strippedURL, "query=" ) != NULL ||
1491 strstr( strippedURL, "source=" ) != NULL) ) /* .ar .br .cl .co .ec .es */
1492 {
1493 strcat( originalURL, "&npl=%26safe%3dhigh" );
1494 UFDB_API_num_safesearch++;
1495 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1496 }
1497 else
1498 if (strcmp( domain, "search.alot.com" ) == 0 && /* SAFESEARCH: alot.com */
1499 strstr( strippedURL, "q=" ) != NULL)
1500 {
1501 strcat( originalURL, "&f=1" );
1502 UFDB_API_num_safesearch++;
1503 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1504 }
1505 else
1506 if (strstr( domain, "searchalot.com" ) != NULL && /* SAFESEARCH: searchalot.com */
1507 strstr( strippedURL, "q=" ) != NULL)
1508 {
1509 strcat( originalURL, "&safesearch=high" );
1510 UFDB_API_num_safesearch++;
1511 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1512 }
1513 else
1514 if (strstr( domain, "alltheinternet.com" ) != NULL && /* SAFESEARCH: alltheinternet.com */
1515 strstr( strippedURL, "q=" ) != NULL)
1516 {
1517 strcat( originalURL, "&safesearch=high" );
1518 UFDB_API_num_safesearch++;
1519 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1520 }
1521 else
1522 if (strstr( domain, "search.yahoo." ) != NULL && /* SAFESEARCH: yahoo.* */
1523 strstr( strippedURL, "p=" ) != NULL)
1524 {
1525 strcat( originalURL, "&vm=r" );
1526 /* TODO: investigate http://www.yahoo.com/r/sx/ *-http://search.yahoo.com/search */
1527 UFDB_API_num_safesearch++;
1528 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1529 }
1530 else
1531 if (strstr( domain, "inspsearchapi.com" ) != NULL &&
1532 strncmp( strippedURL, "/search", 7 ) == 0)
1533 {
1534 strcat( originalURL, "&family-friendly=on" );
1535 UFDB_API_num_safesearch++;
1536 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1537 }
1538 else
1539 if (strstr( domain, "excite." ) != NULL && /* SAFESEARCH: excite.* */
1540 strstr( strippedURL, "search" ) != NULL &&
1541 strchr( strippedURL, '?' ) != NULL) /* Excite */
1542 {
1543 strcat( originalURL, "&familyfilter=1&splash=filtered" );
1544 UFDB_API_num_safesearch++;
1545 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1546 }
1547 else
1548 if (strncmp( domain, "search.msn.", 11 ) == 0) /* SAFESEARCH: msn.* */
1549 {
1550 if (slash == NULL)
1551 strcat( originalURL, "/" );
1552 strcat( originalURL, "&adlt=strict" );
1553 UFDB_API_num_safesearch++;
1554 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1555 }
1556 else
1557 if (strncmp( domain, "search.live.", 12 ) == 0 && /* SAFESEARCH: live.* */
1558 strstr( strippedURL, "q=" ) != NULL)
1559 {
1560 strcat( originalURL, "&adlt=strict" );
1561 UFDB_API_num_safesearch++;
1562 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1563 }
1564 else
1565 if (strcmp( domain, "api.search.live.net" ) == 0 && /* Safesearch: live API */
1566 strstr( strippedURL, "sources=" ) != NULL)
1567 {
1568 strcat( originalURL, "&adlt=strict" );
1569 UFDB_API_num_safesearch++;
1570 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1571 }
1572 else
1573 if (strstr( domain, "blinkx.com" ) != NULL && /* SAFESEARCH: blinkx.com */
1574 strchr( strippedURL, '?' ) != NULL)
1575 {
1576 strcat( originalURL, "&safefilter=on" );
1577 UFDB_API_num_safesearch++;
1578 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1579 }
1580 else
1581 if (strcmp( domain, "etools.ch" ) == 0 &&
1582 strncmp( strippedURL, "/search", 7 ) == 0)
1583 {
1584 strcat( originalURL, "&safeSearch=true" );
1585 UFDB_API_num_safesearch++;
1586 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1587 }
1588 else
1589 if (strncmp( domain, "search.lycos.", 13 ) == 0) /* SAFESEARCH: lycos.* */
1590 {
1591 if (slash == NULL)
1592 strcat( originalURL, "/" );
1593 strcat( originalURL, "&contentFilter=strict&family=on" );
1594 UFDB_API_num_safesearch++;
1595 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1596 }
1597 else
1598 if (strstr( domain, "dogpile.com" ) != NULL || /* SAFESEARCH: dogpile.com */
1599 strstr( domain, "dogpile.co.uk" ) != NULL) /* SAFESEARCH: dogpile.co.uk */
1600 {
1601 if (slash == NULL)
1602 strcat( originalURL, "/" );
1603 strcat( originalURL, "&adultfilter=heavy" );
1604 UFDB_API_num_safesearch++;
1605 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1606 }
1607 else
1608 if (strstr( domain, "infospace.com" ) != NULL) /* SAFESEARCH: infospace.com */
1609 {
1610 if (slash == NULL)
1611 strcat( originalURL, "/" );
1612 strcat( originalURL, "&familyfilter=1" );
1613 UFDB_API_num_safesearch++;
1614 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1615 }
1616 else
1617 if (strstr( domain, "metacrawler.com" ) != NULL) /* SAFESEARCH: metacrawler.com */
1618 {
1619 if (slash == NULL)
1620 strcat( originalURL, "/" );
1621 strcat( originalURL, "&familyfilter=1" );
1622 UFDB_API_num_safesearch++;
1623 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1624 }
1625 else
1626 if (strstr( domain, "webfetch.com" ) != NULL || /* SAFESEARCH: webfetch.com */
1627 strstr( domain, "webfetch.co.uk" ) != NULL) /* SAFESEARCH: webfetch.co.uk */
1628 {
1629 if (slash == NULL)
1630 strcat( originalURL, "/" );
1631 strcat( originalURL, "&familyfilter=1" );
1632 UFDB_API_num_safesearch++;
1633 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1634 }
1635 else
1636 if (strstr( domain, "webcrawler.com" ) != NULL) /* SAFESEARCH: webcrawler.com */
1637 {
1638 if (slash == NULL)
1639 strcat( originalURL, "/" );
1640 strcat( originalURL, "&familyfilter=1" );
1641 UFDB_API_num_safesearch++;
1642 return UFDB_API_MODIFIED_FOR_SAFESEARCH;
1643 }
1644
1645 return UFDB_API_OK;
1646 }
1647
1648
1649 #if 0
1650 UFDB_GCC_HOT UFDB_GCC_INLINE
1651 static int squeeze_html_char(
1652 char * p,
1653 int * hex )
1654 {
1655 int length;
1656
1657 length = 0;
1658 *hex = 0;
1659 while (*p != '\0' && isxdigit( (int) *p ))
1660 {
1661 int h;
1662 h = (*p <= '9') ? *p - '0' : *p - 'a' + 10;
1663 *hex = *hex * 16 + h;
1664 p++;
1665 length++;
1666 }
1667
1668 #if 0
1669 ufdbLogMessage( " squeeze_html_char hex=%04x length=%d *p=%c", *hex, length, *p );
1670 #endif
1671
1672 if (*p != ';')
1673 return -1; /* '&#xxx' without trailing ';' is not a valid HTML character */
1674
1675 if (*hex == 0)
1676 return length;
1677
1678 if (*hex < 0x0020)
1679 {
1680 if (*hex != '\t' && *hex != '\n' && *hex != '\r' && *hex != '\f')
1681 *hex = ' ';
1682 }
1683 else if (*hex == 0x007f || *hex >= 0x00ff)
1684 {
1685 *hex = ' ';
1686 }
1687 else if (*hex <= 'Z' && *hex >= 'A')
1688 {
1689 *hex += 'a' - 'A';
1690 }
1691
1692 return length;
1693 }
1694 #endif
1695
1696
1697 UFDB_GCC_INLINE UFDB_GCC_HOT
increment_UFDB_API_num_url_lookups(void)1698 static void increment_UFDB_API_num_url_lookups( void )
1699 {
1700 #if 0
1701 UFDB_API_num_url_lookups++; // do not use __sync_add_and_fetch()
1702 #elif defined(__GNUC__) && __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 && __SIZEOF_LONG__ == 4
1703 (void) __sync_add_and_fetch( &UFDB_API_num_url_lookups, 1 );
1704 #elif defined(__GNUC__) && __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 && __SIZEOF_LONG__ == 8
1705 (void) __sync_add_and_fetch( &UFDB_API_num_url_lookups, 1 );
1706 #else
1707 UFDB_SHARED static ufdb_mutex incrMutex = ufdb_mutex_initializer;
1708 ufdb_mutex_lock( &incrMutex );
1709 UFDB_API_num_url_lookups++;
1710 ufdb_mutex_unlock( &incrMutex );
1711 #endif
1712 }
1713
1714
1715 UFDB_GCC_HOT
increment_UFDB_API_num_https(void)1716 void increment_UFDB_API_num_https( void )
1717 {
1718 #if 0
1719 UFDB_API_num_https++; // do not use __sync_add_and_fetch()
1720 #elif defined(__GNUC__) && __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 && __SIZEOF_LONG__ == 4
1721 (void) __sync_add_and_fetch( &UFDB_API_num_https, 1 );
1722 #elif defined(__GNUC__) && __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 && __SIZEOF_LONG__ == 8
1723 (void) __sync_add_and_fetch( &UFDB_API_num_https, 1 );
1724 #else
1725 UFDB_SHARED static ufdb_mutex incrMutex = ufdb_mutex_initializer;
1726 ufdb_mutex_lock( &incrMutex );
1727 UFDB_API_num_https++;
1728 ufdb_mutex_unlock( &incrMutex );
1729 #endif
1730 }
1731
1732
1733 /*
1734 * strip a URL:
1735 * remove http:// prefix,
1736 * remove www[0-9*]. prefix,
1737 * remove port number,
1738 * remove username and password,
1739 * remove IP address obfuscations (numbers with leading zeroes)
1740 * convert hex codes (%61 = 'a') to characters,
1741 * convert special characters to void or space,
1742 * convert characters to lower case.
1743 * substitute // by / in a URL
1744 * substitute /./ by / in a URL
1745 * substitute /foo/../bar by /bar in a URL
1746 * trim very long URLs (more than 512 letters)
1747 * do not remove "#foo"
1748 */
1749 UFDB_GCC_HOT
UFDBstripURL2(char * URL,int stripwwwprefix,char * strippedUrl,char * domain,char * protocol,int * portnumber)1750 void UFDBstripURL2(
1751 char * URL, /* input URL string */
1752 int stripwwwprefix, /* input flag for stripping "www." prefix from URL */
1753 char * strippedUrl, /* output char array (must be UFDB_MAX_URL_LENGTH bytes) */
1754 char * domain, /* output char array (must be 1024 bytes) */
1755 char * protocol, /* output char array (must be 16 bytes) */
1756 int * portnumber ) /* output integer */
1757 {
1758 char * p;
1759 char * tmp;
1760 char * domain_start;
1761 char * domain_end;
1762 char * optional_token;
1763 char * origStrippedUrl;
1764 int port;
1765 int is_ip_address;
1766 int is_ipv6;
1767 int obfuscated;
1768 int insideParams;
1769 char buffer[UFDB_MAX_URL_LENGTH];
1770
1771 /*
1772 * This parser has the following parsing stages:
1773 * 1) parse protocol and save it (output).
1774 * 2) skip optional username:password@ and initial "www[nnn]."
1775 * 3) parse FQDN, stop at ':' (port number) or any of / ; ? # (start of path)
1776 * if the FQDN starts with '[' the URL has an IPv6 and parsing is done by separate function.
1777 * 4) store optional port (output) (default is 80, or 443 if protocol is https)
1778 * URL without protocol, username/password, port is now stored in temporary buffer 'buffer'
1779 * NEW in v1.32: the parameters of the URL are also copied to the buffer.
1780 * 5) The FDQN is saved in domainname (output).
1781 * 6) The URL buffer is converted to lowercase but parameters are not converted.
1782 * 7) Check for obfuscated IP addresses and normalise them
1783 * 8) Copy/convert the buffer to strippedURL (output) converting %xx and &#xx; and // and /./ and /../
1784 */
1785
1786 *portnumber = 80;
1787 is_ipv6 = 0;
1788
1789 increment_UFDB_API_num_url_lookups();
1790
1791 /* strip http:// and ftp:// protocol header */
1792 p = findProtocolEnd( URL );
1793 if (p != NULL)
1794 {
1795 int n;
1796 n = p - URL;
1797 if (n == 0 || n > 14)
1798 {
1799 /* ERROR: an absent or very large protocol name... The URL does not start with a valid protocol.
1800 */
1801 strcpy( protocol, "http" );
1802 p = URL;
1803 }
1804 else
1805 {
1806 memcpy( protocol, URL, n );
1807 protocol[n] = '\0';
1808 if (n == 5 && strcasecmp( protocol, "https" ) == 0)
1809 {
1810 increment_UFDB_API_num_https();
1811 *portnumber = 443;
1812 }
1813 p += 3; /* skip '://' */
1814 }
1815 }
1816 else
1817 {
1818 strcpy( protocol, "http" );
1819 p = URL;
1820 }
1821
1822 domain_end = findDomainEnd( p ); /* might not be accurate and skipped the ':port' */
1823
1824 optional_token = strchr_before( p, domain_end, '@' ); /* strip user:password@ */
1825 if (optional_token != NULL)
1826 p = optional_token + 1;
1827
1828 domain_start = p;
1829
1830 #if 0
1831 if (ufdbGV.debug)
1832 ufdbLogMessage( " UFDBstripURL2: p: %s\n", p );
1833 #endif
1834
1835 if (*p == '[') /* IPv6 URL: http://[::1]:80/index.html */
1836 {
1837 char * end;
1838 char * oldBracket;
1839
1840 is_ipv6 = 1;
1841 oldBracket = strchr( p, ']' );
1842 if (oldBracket != NULL)
1843 *oldBracket = '\0';
1844 end = UFDBparseIPv6address( p, domain );
1845 if (oldBracket != NULL)
1846 *oldBracket = ']';
1847 if (end != NULL)
1848 {
1849 UFDBupdateURLwithNormalisedDomain( p, domain );
1850 /* uh-oh: the normalised domain is usually smaller and our pointers have moved */
1851 domain_end = findDomainEnd( p );
1852 oldBracket = strchr( p, ']' );
1853 if (oldBracket == NULL)
1854 oldBracket = domain_end - 1;
1855
1856 optional_token = strchr_before( oldBracket, domain_end, ':' );
1857 #if 0
1858 if (ufdbGV.debug > 1)
1859 ufdbLogMessage( " UFDBstripURL2: domain_end: %08x oldBracket: %08x token: %08x\n",
1860 domain_end, oldBracket, optional_token );
1861 #endif
1862 }
1863 else
1864 optional_token = NULL;
1865 }
1866 else
1867 {
1868 if (stripwwwprefix) /* strip www[0-9]{0,2}. */
1869 {
1870 if ((p[0] == 'w' || p[0] == 'W') &&
1871 (p[1] == 'w' || p[1] == 'W') &&
1872 (p[2] == 'w' || p[2] == 'W'))
1873 {
1874 tmp = p + 3;
1875 if (*tmp <= '9' && *tmp >= '0')
1876 tmp++;
1877 if (*tmp <= '9' && *tmp >= '0')
1878 tmp++;
1879 if (*tmp == '.' && strchr_before( tmp+1, domain_end, '.' ) != NULL)
1880 p = tmp + 1;
1881 }
1882 }
1883 optional_token = strchr_before( p, domain_end, ':' );
1884 }
1885
1886 /* parse-and-strip ":<portnum>" */
1887 tmp = buffer;
1888 if (optional_token != NULL)
1889 { /* copy domain name */
1890 while (p < optional_token)
1891 *tmp++ = *p++;
1892 *tmp = '\0';
1893
1894 p++;
1895 port = 0;
1896 while (*p <= '9' && *p >= '0')
1897 {
1898 port = port * 10 + (*p - '0');
1899 p++;
1900 }
1901
1902 if (port == 443 && *portnumber != 443) /* Squid sends "example.com:443" with CONNECT */
1903 {
1904 increment_UFDB_API_num_https();
1905 strcpy( protocol, "https" );
1906 }
1907
1908 *portnumber = port;
1909 ufdbStrncpy( tmp, p, UFDB_MAX_URL_LENGTH-256-6-1 ); /* copy rest of the URL */
1910 }
1911 while (1)
1912 {
1913 if (*p == '\0' || *p == '#') break;
1914 *tmp++ = *p++;
1915
1916 if (*p == '\0' || *p == '#') break;
1917 *tmp++ = *p++;
1918
1919 if (*p == '\0' || *p == '#') break;
1920 *tmp++ = *p++;
1921
1922 if (*p == '\0' || *p == '#') break;
1923 *tmp++ = *p++;
1924
1925 if (tmp >= &buffer[UFDB_MAX_URL_LENGTH-2-4])
1926 break;
1927 }
1928 *tmp = '\0';
1929
1930 if (!is_ipv6) /* save the original domainname */
1931 {
1932 int n;
1933
1934 if (optional_token != NULL)
1935 domain_end = optional_token;
1936 n = domain_end - domain_start;
1937 if (n >= 1023)
1938 {
1939 strcpy( domain, "domaintoolong.urlfilterdb.com" );
1940 }
1941 else
1942 {
1943 memcpy( domain, domain_start, n );
1944 domain[n] = '\0';
1945 }
1946 }
1947
1948 /*
1949 * Now a temporary URL is in 'buffer'.
1950 * The temporary URL has no protocol, portnum, username/password, initial "www[nnn].".
1951 * Convert URL to lower case but stop at the first '?' to leave parameters untouched.
1952 */
1953 tmp = buffer;
1954 while (*tmp != '\0')
1955 {
1956 if (*tmp == '?')
1957 {
1958 if (!ufdbGV.parseURLparameters) // UFDBstripURL2(): parse parameters or not?
1959 *tmp = '\0';
1960 goto stop_lowercasing;
1961 }
1962 if (*tmp <= 'Z' && *tmp >= 'A')
1963 *tmp += 'a' - 'A';
1964 tmp++;
1965 }
1966 *tmp++ = '\0'; /* prevent problems with % at end of URL */
1967 *tmp = '\0';
1968 stop_lowercasing:
1969
1970 #if 0
1971 if (ufdbGV.debug)
1972 ufdbLogMessage( " UFDBstripURL2: after lowercasing: %s\n", buffer );
1973 #endif
1974
1975 /* scan for IP address obfuscations */
1976 obfuscated = 0;
1977 is_ip_address = 1;
1978 for (tmp = domain; *tmp != '\0'; )
1979 {
1980 if (*tmp == '0')
1981 obfuscated = 1;
1982 if (*tmp == '0' && *(tmp+1) == 'x') /* parse 1 hex octet 0xHH */
1983 {
1984 tmp += 2;
1985 while (*tmp != '\0' && isxdigit((int) *tmp))
1986 tmp++;
1987 if (*tmp != '\0' && *tmp != '.')
1988 {
1989 is_ip_address = 0;
1990 break;
1991 }
1992 }
1993 else if (*tmp <= '9' && *tmp >= '0')
1994 {
1995 while (*tmp != '\0' && (*tmp <= '9' && *tmp >= '0')) /* parse 1 octal or integer octet */
1996 tmp++;
1997 if (*tmp != '\0' && *tmp != '.')
1998 {
1999 is_ip_address = 0;
2000 break;
2001 }
2002 }
2003 else
2004 {
2005 is_ip_address = 0;
2006 break;
2007 }
2008 if (*tmp == '.')
2009 tmp++;
2010 }
2011 if (is_ip_address && obfuscated)
2012 removeObfuscations( domain, buffer );
2013
2014 /*
2015 * Copy the buffer to strippedUrl, while converting hex codes to characters.
2016 * After the first '?' we only do %HH character conversion
2017 */
2018 insideParams = 0;
2019 origStrippedUrl = strippedUrl;
2020 p = buffer;
2021 while (*p != '\0')
2022 {
2023 if (*p == ':' && *(p+1) == '/' && *(p+2) == '/') /* do not replace :// by :/ */
2024 {
2025 *strippedUrl++ = *p++;
2026 *strippedUrl++ = *p++;
2027 *strippedUrl++ = *p++;
2028 }
2029 else if (*p == '%') /* start of a HEX code */
2030 {
2031 if (isxdigit((int) *(p+1)) && isxdigit((int) *(p+2)))
2032 {
2033 char h;
2034 int hex;
2035
2036 h = *(p+1);
2037 if (h <= '9')
2038 hex = (h - '0') * 16;
2039 else if (h <= 'F')
2040 hex = (h - 'A' + 10) * 16;
2041 else
2042 hex = (h - 'a' + 10) * 16;
2043 h = *(p+2);
2044 if (h <= '9')
2045 hex += (h - '0');
2046 else if (h <= 'F')
2047 hex += (h - 'A' + 10);
2048 else
2049 hex += (h - 'a' + 10);
2050 /* be careful with control characters */
2051 if (hex < 0x20)
2052 {
2053 if (hex == 0)
2054 {
2055 p += 3;
2056 continue;
2057 }
2058 hex = ' ';
2059 *strippedUrl++ = hex;
2060 p += 3;
2061 }
2062 else
2063 {
2064 if (!insideParams && hex <= 'Z' && hex >= 'A')
2065 hex += 'a' - 'A';
2066 else if (hex == 0x7f)
2067 hex = ' ';
2068
2069 *strippedUrl++ = hex;
2070 p += 3;
2071 if (hex == ':' && *(p) == '/' && *(p+1) == '/') // do not replace :// by :/
2072 {
2073 *strippedUrl++ = '/';
2074 *strippedUrl++ = '/';
2075 p += 2;
2076 }
2077 }
2078 }
2079 else /* erroneous code */
2080 {
2081 *strippedUrl++ = *p++; /* just copy the '%' */
2082 }
2083 }
2084 else /* plain character */
2085 {
2086 while (*p == '/')
2087 {
2088 if (*(p+1) == '/') /* substitute // by / but not in "xxx://" */
2089 p++;
2090 else if (*(p+1) == '.' && *(p+2) == '/') /* substitute /./ by / */
2091 p += 2;
2092 else if (*(p+1) == '.' && *(p+2) == '.' && *(p+3) == '/') /* substitute /xxxx/../ by / */
2093 {
2094 /* try to find the previous directory... */
2095 char * tmp;
2096 tmp = strippedUrl - 1;
2097 while (*tmp != '/' && tmp > origStrippedUrl)
2098 tmp--;
2099 if (tmp > origStrippedUrl)
2100 {
2101 strippedUrl = tmp;
2102 p += 3;
2103 }
2104 else
2105 break;
2106 }
2107 else
2108 break;
2109 }
2110 if (*p == '?') /* just copy the '?' */
2111 {
2112 insideParams = 1;
2113 *strippedUrl++ = *p++;
2114 }
2115 else
2116 #if 0
2117 if (*p == '#')
2118 {
2119 break;
2120 }
2121 else
2122 #endif
2123 *strippedUrl++ = *p++;
2124 }
2125 }
2126 *strippedUrl++ = '\0';
2127 *strippedUrl = '\0'; /* sset2 requires double \0 termination */
2128 }
2129
2130
2131 UFDB_GCC_HOT
UFDBstripURL(char * URL,char * strippedUrl,char * domain,char * protocol,int * portnumber)2132 void UFDBstripURL(
2133 char * URL, /* input URL string */
2134 char * strippedUrl, /* output char array (must be UFDB_MAX_URL_LENGTH bytes) */
2135 char * domain, /* output char array (must be 1024 bytes) */
2136 char * protocol, /* output char array (must be 16 bytes) */
2137 int * portnumber ) /* output integer */
2138 {
2139 UFDBstripURL2( URL, 1, strippedUrl, domain, protocol, portnumber );
2140 }
2141
2142
/*
 * UFDBprintable - make a string safe for printing.
 *
 * Every character outside the range 32..126 is overwritten by '?'.
 * The string is modified in place and returned.
 * A NULL pointer yields the constant string "NULL".
 */
char * UFDBprintable( char * string )
{
   char * cursor;

   if (string == NULL)
      return (char *) "NULL";

   for (cursor = string;  *cursor != '\0';  cursor++)
   {
      if (!(*cursor >= 32  &&  *cursor <= 126))
         *cursor = '?';
   }

   return string;
}
2160
2161
2162 #if !UFDB_BARE_METAL_SUPPORT
/*
 * ufdbGetSysInfo - fill 'si' with the result of uname().
 *
 * On success the string fields are forcibly '\0'-terminated and the nodename is
 * replaced by the result of gethostname() (its return value is ignored).
 * When uname() fails, machine, release, nodename and sysname get placeholder values.
 */
void ufdbGetSysInfo(
   struct utsname * si )
{
   if (uname( si ) >= 0)
   {
      (void) gethostname( si->nodename, sizeof(si->nodename) );
      si->nodename[ sizeof(si->nodename)-1 ] = '\0';

      si->sysname[ sizeof(si->sysname)-1 ] = '\0';
      si->release[ sizeof(si->release)-1 ] = '\0';
      si->machine[ sizeof(si->machine)-1 ] = '\0';
      return;
   }

   /* uname() failed: use placeholders */
   strcpy( si->sysname, "sysname" );
   strcpy( si->nodename, "unknown" );
   strcpy( si->release, "R?" );
   strcpy( si->machine, "M?" );
}
2183
2184
/*
 * ufdbGetNumCPUs - return the number of online CPUs, or 0 when it cannot be determined.
 *
 * sysconf(_SC_NPROCESSORS_ONLN) is used when available; its error value (-1) is
 * mapped to 0, the same value that is returned on platforms without any method.
 * The fallback asks the kernel for the CPU affinity mask of this process and
 * counts the bits that are set.
 */
long ufdbGetNumCPUs( void )
{
   long num_cpus;

   num_cpus = 0;

#if defined(_SC_NPROCESSORS_ONLN)
   num_cpus = sysconf( _SC_NPROCESSORS_ONLN );
   if (num_cpus < 0)                    /* sysconf() failed */
      num_cpus = 0;

#elif defined(__NR_sched_getaffinity)
   /* sched_setaffinity() is buggy on linux 2.4.x so we use syscall() instead */
   {
      unsigned long cpu_mask;
      unsigned int  bit;

      cpu_mask = 0;
      if (syscall( __NR_sched_getaffinity, getpid(), sizeof(cpu_mask), &cpu_mask ) >= 0)
      {
         for (bit = 0;  bit < sizeof(cpu_mask) * 8;  bit++)
            if (cpu_mask & (1UL << bit))
               num_cpus++;
      }
   }
#endif

   return num_cpus;
}
2211 #endif
2212
2213
/*
 * UFDBcalcCksum - calculate a simple checksum of a memory area.
 *
 * parameters: mem  - start of the memory area
 *             size - number of bytes; nothing is read when size <= 0
 *
 * The sum starts at 17 and for each byte it is multiplied by 13 and increased by
 * 3 times the byte (converted from plain char to unsigned int).
 * The arithmetic is unsigned and wraps around; only the final result is reduced
 * to the range 0..99999.
 */
int UFDBcalcCksum( char * mem, long size )
{
   unsigned int sum;
   long         i;

   sum = 17;
   for (i = 0;  i < size;  i++)
      sum = sum * 13 + ((unsigned int) mem[i]) * 3;

   return (int) (sum % 100000);
}
2224
2225
2226 #if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)
2227 // #pragma GCC pop_options
2228 #endif
2229
2230
2231 #ifdef __cplusplus
2232 }
2233 #endif
2234