1 /* bzflag
2  * Copyright (c) 1993-2021 Tim Riker
3  *
4  * This package is free software;  you can redistribute it and/or
5  * modify it under the terms of the license found in the file
6  * named COPYING that should have accompanied this file.
7  *
8  * THIS PACKAGE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
9  * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
10  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
11  */
12 
// this class's interface
14 #include "WordFilter.h"
15 
16 // system headers
17 #include <ctype.h>
18 #include <string>
19 
20 // implementation-specific headers
21 #ifdef DEBUG
22 #  include "TimeKeeper.h"
23 #endif
24 
25 
26 /* private */
27 
28 /* protected */
29 
simpleFilter(char * input) const30 bool WordFilter::simpleFilter(char *input) const
31 {
32     bool filtered = false;
33     filter_t findWord;
34 
35     std::string line = input;
36     int startPosition = line.find_first_of(alphabet);
37 
38     int endPosition;
39     std::string word;
40     unsigned int firstchar;
41 
42     /* here we iterate over all of the words in the input and replace
43      * exact matches with asterisks
44      */
45     while (startPosition >= 0)
46     {
47         endPosition = line.find_first_not_of(alphabet, startPosition+1);
48         if (endPosition < 0)
49             endPosition = line.length();
50 
51         word = line.substr(startPosition, endPosition-startPosition);
52         // transform to lowercase
53         std::transform (word.begin(), word.end(), word.begin(), tolower);
54 
55         findWord.word = word;
56         firstchar = (unsigned char)word[0];
57         if (filters[firstchar].find(findWord) != \
58                 filters[firstchar].end())
59         {
60 
61             /* fill with asterisks */
62             //      memset(input+startPosition,'*', endPosition-startPosition);
63 
64             /* fill with random filter chars */
65             if (filterCharacters(input, startPosition, endPosition-startPosition, true) > 0)
66                 filtered=true;
67         }
68         startPosition = line.find_first_of(alphabet, endPosition);
69     }
70     return filtered;
71 } // end simpleFilter
72 
73 
aggressiveFilter(char * input) const74 bool WordFilter::aggressiveFilter(char *input) const
75 {
76 #if  !defined(HAVE_REGEX_H)
77 
78     std::cerr << "Regular expressions are not available (using the simple filter)" << std::endl;
79     return simpleFilter(input);
80 
81 #else /* HAVE_REGEX_H */
82 
83     bool filtered = false;
84     regmatch_t match[1];
85     if (input == NULL) return false;
86     int inputLength = strlen(input);
87 
88     // a buffer to destroy during matching (includes terminating null)
89     std::string sInput = input;
90 
91     /* maintain an array of match indices of the input; values are in
92      * pairs.  the first number is the start position, the second is
93      * a length.  array has initial size of 256 (128 words).
94      */
95     std::vector<int> matchPair(MAX_FILTER_SETS);
96 
97     // how many match pairs are stored
98     unsigned int matchCount = 0;
99 
100     // stores which letters have been used already
101     std::string wordIndices;
102 
103     /* here, we get a list of characters that might be the start of a word.
104      * characters that are preceded by a non-alphabetic character are added
105      * to a list of letters.  e.g. the first line of this comment consists
106      * of the letters:  hwgaloctmbs
107      * This means that we will only search through filter words that begin
108      * with those letters.
109      */
110     char previousChar = 0;
111     for (int counter = 0; counter < inputLength; counter++)
112     {
113         char c = tolower(sInput[counter]);
114 
115         if (!isalpha(previousChar) && TextUtils::isVisible(c))
116         {
117 
118             // expand punctuation to potential alphabetic characters
119             if (TextUtils::isPunctuation(c))
120             {
121 
122                 std::string puncChars = alphabeticSetFromCharacter(c);
123                 for (unsigned int cnt = 0; cnt < puncChars.size(); cnt++)
124                     appendUniqueChar(wordIndices, tolower(puncChars[cnt]));
125             }
126 
127             // add the character itself if not previously added
128             appendUniqueChar(wordIndices, c);
129         }
130         previousChar = c;
131     }
132 
133     /* check the input for a words that begin with a prefix.  if it does,
134      * then add the next letter as a letter to check for during matching.
135      * e.g. "bzstring" will match a prefix of "bz" and make "s" get added
136      * as a bin to check for during matching.
137      */
138     for (ExpCompareSet::const_iterator i = prefixes.begin(); i != prefixes.end(); ++i)
139     {
140         if (regexec(i->compiled, sInput.c_str(), 1, match, 0) == 0)
141         {
142             if ( (match[0].rm_eo < inputLength) && isalpha(sInput[match[0].rm_eo]) )
143             {
144                 /* do not forget to make sure this is a true prefix */
145                 if ( (match[0].rm_so > 0) && isalpha(sInput[match[0].rm_so - 1]) )
146                     continue;
147 
148                 /* we found a prefix -- add the letter that follows */
149                 appendUniqueChar(wordIndices, tolower(sInput[match[0].rm_eo]));
150             }
151         }
152     }
153 
154 
155 //std::cout << "WordIndexLetters are [" << wordIndices << "]" << std::endl;
156     // now we have a record of all potential word boundary positions
157 
158 
159     /* iterate over the filter words for each unique initial word character */
160     int regCode;
161     for (unsigned int j = 0; j < wordIndices.size(); j++)
162     {
163 
164         /* look at all of the filters that start with the letter wordIndices[j]
165          */
166         const unsigned int firstchar = (unsigned char)wordIndices[j];
167         for (ExpCompareSet::const_iterator i = filters[firstchar].begin();
168                 i != filters[firstchar].end(); ++i)
169         {
170 
171             /* the big kahuna burger processing goes on here */
172             bool matched = true;
173             while (matched)
174             {
175                 matched = false;
176 
177                 regCode = regexec(i->compiled, sInput.c_str(), 1, match, 0);
178 
179 //std::cout << "input is [" << sInput << "]" << std::endl;
180 
181                 if ( regCode == 0 )
182                 {
183                     int startOffset = match[0].rm_so;
184                     int endOffset = match[0].rm_eo;
185 
186 //std::cout << "We matched ... ";
187 
188                     /* make sure we only match on word boundaries */
189                     if ( (startOffset>1) && (isalpha(sInput[startOffset-1])) )
190                     {
191 
192 //std::cout << "but didn't match a word beginning" << std::endl;
193 
194                         /* we are in the middle of a word.. see if we can match a prefix before this */
195                         bool foundit =  false;
196                         for (ExpCompareSet::const_iterator k = prefixes.begin();
197                                 k != prefixes.end(); ++k)
198                         {
199                             if (regexec(k->compiled, sInput.c_str(), 1, match, 0) == 0)
200                             {
201 
202 //std::cout << "checking prefix: " << k->word << std::endl;
203 
204                                 if ( (match[0].rm_so > 1) && (isalpha(sInput[match[0].rm_so - 1])) )
205                                 {
206                                     /* we matched, but we are still in the middle of a word */
207                                     continue;
208                                 }
209 
210 //std::cout << "matched a prefix! " << k->word << std::endl;
211                                 if (match[0].rm_eo == startOffset)
212                                 {
213                                     /* perfect prefix match */
214                                     startOffset = match[0].rm_so;
215                                     foundit = true;
216                                     break;
217                                 }
218                             }
219                         }
220                         if (!foundit)
221                         {
222                             /* couldn't find a prefix, so skip this match */
223 //std::cout << "Could not find a prefix" <<std::endl;
224                             continue;
225                         }
226                     }
227 
228 //std::cout << "is endoffset alphabetic: " << input[endOffset] << std::endl;
229 
230                     if ( (endOffset<inputLength-1) && (isalpha(sInput[endOffset])) )
231                     {
232 
233 //std::cout << "but didn't match a word ending" << std::endl;
234 
235                         /* we are at the start of a word, but not at the end, try to get to the end */
236                         bool foundit = false;
237                         for (ExpCompareSet::const_iterator k = suffixes.begin();
238                                 k != suffixes.end(); ++k)
239                         {
240 //std::cout << "checking " << k->word << " against [" << input + endOffset << "]" << std::endl;
241 
242                             if (regexec(k->compiled, sInput.c_str() + endOffset, 1, match, 0) == 0)
243                             {
244 
245 //std::cout << "is " << match[0].rm_eo << " less than " << inputLength - endOffset << std::endl;
246 //std::cout << "is alpha =?= " << input[endOffset + match[0].rm_eo + 1] << std::endl;
247 
248                                 /* again, make sure we are now at a word end */
249                                 if ( (match[0].rm_eo < inputLength - endOffset) &&
250                                         (isalpha(sInput[endOffset + match[0].rm_eo])) )
251                                 {
252                                     /* we matched, but we are still in the middle of a word */
253                                     continue;
254                                 }
255 
256 //std::cout << "matched a suffix! " << k->word << std::endl;
257                                 if (match[0].rm_so == 0)
258                                 {
259                                     /* push the end forward a little since we matched */
260                                     endOffset += match[0].rm_eo;
261                                     foundit = true;
262                                     break;
263                                 }
264                             }
265                         }
266                         if (!foundit)
267                         {
268                             /* couldn't find a suffix, so skip this match */
269 //std::cout << "Could not find a suffix" <<std::endl;
270                             continue;
271                         }
272                     }
273 
274                     int matchLength = endOffset - startOffset;
275 
276                     // make sure that longer matches actually include at least 1 alphabetic
277                     if (matchLength > 3)
278                     {
279                         bool foundAlpha = false;
280                         for (std::string::const_iterator position = sInput.begin(); position != sInput.end(); position++)
281                         {
282                             if (isalpha(*position))
283                             {
284                                 foundAlpha = true;
285                                 break;
286                             }
287                         }
288                         if (!foundAlpha)
289                             continue;
290                     }
291 
292                     // add a few more slots if necessary (this should be rare/never)
293                     if (matchCount * 2 + 1 >= matchPair.size())
294                         matchPair.resize(matchCount * 2 + 201);
295 
296                     matchPair[matchCount * 2] = startOffset; /* position */
297                     matchPair[(matchCount * 2) + 1] = matchLength; /* length */
298                     matchCount++;
299                     filtered = true;
300                     matched = true;
301                     // zappo! .. erase stuff that has been filtered to speed up future checks
302                     // fill with some non-whitespace alpha that is not the start/end of a suffix to prevent rematch
303                     std::string filler;
304                     filler.assign(matchLength, 'W');
305                     sInput.replace(startOffset, matchLength, filler);
306 
307                 }
308                 else if ( regCode == REG_NOMATCH )
309                 {
310                     // do nothing
311                     continue;
312 
313                 }
314                 else
315                 {
316                     char errorBuffer[256];
317                     regerror(regCode, i->compiled, errorBuffer, 256);
318                     std::cout << errorBuffer << std::endl;
319 
320                 } /* end if regcode */
321 
322             } /* end regexec-ing */
323 
324         } /* iterate over words in a particular character bin */
325 
326     } /* iterate over characters */
327 
328     /* finally filter the input.  only filter actual alphanumerics. */
329     for (unsigned int l = 0; l < matchCount; l++)
330     {
331         /* !!! debug */
332 #ifdef DEBUG
333         char tmp[256] = {0};
334         strncpy(tmp, input + matchPair[l*2], matchPair[(l*2)+1]);
335         std::cout << "Matched: [" << tmp << "]" << std::endl;
336 #endif
337 
338         if (filterCharacters(input, matchPair[l*2], matchPair[(l*2)+1]) <= 0)
339         {
340             // XXX with multiple matching, we will be unable to filter overlapping matches
341             //      std::cerr << "Unable to filter characters" << std::endl;
342             continue;
343         }
344     }
345 
346 
347     return filtered;
348 
349 #endif /* HAVE_REGEX_H */
350 } // end aggressiveFilter
351 
352 
353 // provides a pointer to a fresh compiled expression for some given expression
354 #if !defined(HAVE_REGEX_H)
getCompiledExpression(const std::string &) const355 regex_t *WordFilter::getCompiledExpression(const std::string &) const
356 {
357     return (regex_t *)NULL;
358 }
359 #else /* HAVE_REGEX_H */
getCompiledExpression(const std::string & word) const360 regex_t *WordFilter::getCompiledExpression(const std::string &word) const
361 {
362     regex_t *compiledReg;
363 
364     /* XXX need to convert this to use new/delete */
365     if ( (compiledReg = (regex_t *)calloc(1, sizeof(regex_t))) == NULL )
366     {
367 
368         perror("calloc failed");
369         std::cerr << "Warning: unable to allocate memory for compiled regular expression";
370         return (regex_t *)NULL;
371 
372     }
373 
374     if ( regcomp(compiledReg, word.c_str(), REG_EXTENDED | REG_ICASE) != 0 )
375     {
376         std::cerr << "Warning: unable to compile regular expression for [" << word << "]" << std::endl;
377         free(compiledReg);
378         return (regex_t *)NULL;
379     }
380     return compiledReg;
381 
382 }
383 #endif /* HAVE_REGEX_H */
384 
385 
l33tspeakSetFromCharacter(const char c) const386 std::string WordFilter::l33tspeakSetFromCharacter(const char c) const
387 {
388     std::string set = "";
389 
390     if (!isalnum(c))
391     {
392         /* escape the non-alphanumeric (punctuation or control chars) */
393         set = "  ";
394         set[0] = '\\';
395         set[1] = c;
396         return set;
397     }
398     else if (isspace(c))
399     {
400         set = " ";
401         set[0] = c;
402         return set;
403     }
404 
405     switch (c)
406     {
407     case 'a':
408         set = "a4@";
409         break;
410     case 'b':
411         set = "b8";
412         break;
413     case 'c':
414         set = "c\\(";
415         break;
416     case 'e':
417         set = "e3";
418         break;
419     case 'g':
420         set = "g96";
421         break;
422     case 'i':
423         set = "il1|!\\/";
424         break;
425     case 'l':
426         set = "li1!|\\/";
427         break;
428     case 'o':
429         set ="o0";
430         break;
431     case 's':
432         // dollarsign $ may not be the first char..
433         set = "s$z5";
434         break;
435     case 't':
436         set = "t+7";
437         break;
438     case 'v':
439         set = "v\\/";
440         break;
441     case 'w':
442         set = "w\\/";
443         break;
444     case 'z':
445         set = "zs";
446         break;
447     default:
448         set = " ";
449         set[0] = c;
450         break;
451     }
452 
453     return set;
454 }
455 
456 
alphabeticSetFromCharacter(const char c) const457 std::string WordFilter::alphabeticSetFromCharacter(const char c) const
458 {
459     std::string set = " ";
460 
461     /* for most punctuation, we include the actual punctuation
462      * last just in case it was really intended
463      */
464     switch (c)
465     {
466     case '!':
467         set = "il";
468         break;
469     case '@':
470         set = "a";
471         break;
472     case '$':
473         set =  "s";
474         break;
475     case '&':
476         set = "s";
477         break;
478     case '(':
479         set = "cil";
480         break;
481     case ')':
482         set = "il";
483         break;
484     case '+':
485         set = "t";
486         break;
487     case '|':
488         set = "li";
489         break;
490     case '\\':
491         set = "li";
492         break;
493     case '{':
494         set = "c";
495         break;
496     case '/':
497         set = "il";
498         break;
499     case '*':
500         set = "aeiou";
501         break;
502     default:
503         set = " ";
504         set[0] = c;
505         break;
506     }
507 
508     return set;
509 }
510 
511 
512 
expressionFromString(const std::string & word) const513 std::string WordFilter::expressionFromString(const std::string &word) const
514 {
515     /* create the regular expression description */
516     std::string expression;
517     unsigned int length = word.length();
518     std::string charSet;
519 
520     /* individual characters expand into a potential set of matchable characters */
521     for (unsigned int i = 0; i < length; i++)
522     {
523 
524         // convert to lowercase for simplicity and speed
525         charSet = l33tspeakSetFromCharacter(tolower(word[i]));
526 
527         /* we specifically will create a regular expression that should at least
528          * match exactly the given input, including any spaces or special
529          * characters.  including spaces or other characters in the input will
530          * make them required to create a match.
531          */
532 
533         /* append multi-letter expansions */
534         if (charSet[0] == 'f')
535         {
536             /* ensure we don't capture non-printables after end of word */
537             if (i != length - 1)
538                 expression.append("[fp]+[^[:alpha:]]*h?[^[:alpha:]]*");
539             else
540                 expression.append("[fp]+h?");
541         }
542         else
543         {
544 
545             if ( charSet.size() >= 1 )
546             {
547                 /* appends characters classes */
548                 expression.append("[");
549                 expression.append(charSet);
550                 expression.append("]");
551             }
552             else if (charSet.size() == 1)
553             {
554                 /* append single characters */
555                 expression.append(charSet);
556             }
557             else
558             {
559                 std::cout << "ERROR: l33t-speak returned an empty string" << std::endl;
560                 std::cout << "ERROR: This should never happen" << std::endl;
561                 exit(1);
562             }
563 
564             /* ensure we don't capture non-printables after end of word. these do
565              * not get appended to the special "f" case.
566              */
567             if (i != length - 1)
568                 expression.append("+[^[:alpha:]]*");
569             else
570                 expression.append("+");
571 
572         } // end test for multi-letter expansions
573 
574     } // end iteration over word letters
575 
576     //  std::cout << "EXP: " <<  expression << std::endl;
577 
578     return expression;
579 }
580 
581 
582 /* public: */
583 
WordFilter()584 WordFilter::WordFilter()
585 {
586     filter_t fix;
587 
588     /* set up the alphabet for simple filtering */
589     alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
590 
591     /* filter characters randomly used to replace filtered text */
592     filterChars = "!@#$%^&*";
593 
594     /* SUFFIXES */
595 
596 #if 1
597     // noun
598     addSuffix("dom");
599     addSuffix("ity");
600     addSuffix("ment");
601     addSuffix("sion");
602     addSuffix("tion");
603     addSuffix("ness");
604     addSuffix("ance");
605     addSuffix("ence");
606     addSuffix("er");
607     addSuffix("or");
608     addSuffix("ist");
609     // adjective
610     addSuffix("ive");
611     addSuffix("ic");
612     addSuffix("al");
613     addSuffix("able");
614     addSuffix("y");
615     addSuffix("ous");
616     addSuffix("ful");
617     addSuffix("less");
618     // verb
619     addSuffix("en");
620     addSuffix("ize");
621     addSuffix("ate");
622     addSuffix("ify");
623     addSuffix("fy");
624     addSuffix("ed");
625     // adverb
626     addSuffix("ly");
627     // slang
628     addSuffix("a");
629     addSuffix("e");
630     addSuffix("i");
631     addSuffix("o");
632     addSuffix("u");
633     addSuffix("z");
634     addSuffix("r");
635     addSuffix("ah");
636     addSuffix("io");
637     addSuffix("rs");
638     addSuffix("rz");
639     addSuffix("in");
640     addSuffix("n");
641     addSuffix("ster");
642     addSuffix("meister");
643     // plurality
644     addSuffix("s");
645     addSuffix("es");
646     // imperfect verb
647     addSuffix("ing");
648     // diminutive
649     addSuffix("let");
650 
651 #endif
652 
653     /* PREFIXES */
654 
655     // bz-specific
656 
657 #if 1
658     /* XXX adding prefixes _significantly_ increases the expression count
659      * and is rather expensive (slow, XN+N extra checks for N words)
660      */
661     addPrefix("bz"); // bz-specific prefix
662     addPrefix("beze"); // bz-specific prefix
663     addPrefix("u"); // l33t prefix
664     addPrefix("you");
665     addPrefix("ura"); // l33t prefix
666     addPrefix("k"); // common l33t prefix
667 
668 #endif
669 
670     return;
671 }
672 
673 /** default copy constructor */
WordFilter(const WordFilter & _filter)674 WordFilter::WordFilter(const WordFilter& _filter)
675     : alphabet(_filter.alphabet),
676       filterChars(_filter.filterChars),
677       suffixes(_filter.suffixes),
678       prefixes(_filter.prefixes)
679 {
680     for (int i=0; i < MAX_FILTER_SETS; i++)
681         filters[i] = _filter.filters[i];
682 }
683 
684 
685 
686 /** destructor releases the compiled bad words */
~WordFilter(void)687 WordFilter::~WordFilter(void)
688 {
689     ExpCompareSet::iterator i;
690 
691     // delete compiled words
692     for (int j = 0; j < MAX_FILTER_SETS; j++)
693     {
694         for (i = filters[j].begin();
695                 i != filters[j].end();
696                 ++i)
697         {
698             if (i->compiled)
699             {
700                 regfree(i->compiled);
701                 free(i->compiled);
702             }
703         }
704     }
705     // delete compiled prefixes
706     for (i = prefixes.begin();
707             i != prefixes.end();
708             ++i)
709     {
710         if (i->compiled)
711         {
712             regfree(i->compiled);
713             free(i->compiled);
714         }
715     }
716     // delete compiled suffixes
717     for (i = suffixes.begin();
718             i != suffixes.end();
719             ++i)
720     {
721         if (i->compiled)
722         {
723             regfree(i->compiled);
724             free(i->compiled);
725         }
726     }
727 
728     return;
729 }
730 
731 
732 // adds an individual word to the filter list
addToFilter(const std::string & word,const std::string & expression)733 bool WordFilter::addToFilter(const std::string &word, const std::string &expression)
734 {
735     long int length = (long int)word.length();
736     if (0 >= length)
737     {
738         std::cerr << "Someone tried to add an empty word to the filter" << std::endl;
739         return false;
740     } // end check if word is empty
741 
742     if (expression.size() == 0)
743     {
744         /* make sure to create an expression if it wasn't given */
745         std::string expr = expressionFromString(word);
746         return addToFilter(word, expr);
747 
748     }
749     else
750     {
751         /* base case */
752         filter_t newFilter;
753 
754         newFilter.word = word;
755         newFilter.compiled = getCompiledExpression(expression);
756 
757         unsigned int firstchar = (unsigned char)tolower(word[0]);
758         /* check if the word is already added */
759         if (filters[firstchar].find(newFilter) != \
760                 filters[firstchar].end())
761         {
762             regfree(newFilter.compiled);
763             free(newFilter.compiled);
764             return false;
765         }
766         else
767             filters[firstchar].insert(newFilter);
768         return true;
769     }
770 } // end addToFilter
771 
772 
773 /** loads a set of bad words from a specified file */
loadFromFile(const std::string & fileName,bool verbose)774 unsigned int WordFilter::loadFromFile(const std::string &fileName, bool verbose)
775 {
776     char buffer[2048];
777     unsigned int totalAdded=0;
778     std::ifstream filterStream(fileName.c_str());
779 
780     if (!filterStream)
781     {
782         if (verbose)
783             std::cerr << "Warning: '" << fileName << "' bad word filter file not found" << std::endl;
784         return 0;
785     }
786 
787     while (filterStream.good())
788     {
789         filterStream.getline(buffer,2048);
790 
791         std::string filterWord = buffer;
792 
793         int position = filterWord.find_first_not_of("\r\n\t ");
794 
795         // trim leading whitespace
796         if (position > 0)
797             filterWord = filterWord.substr(position);
798 
799         position = filterWord.find_first_of("#\r\n");
800 
801         // trim trailing comments
802         if ((position >= 0) && (position < (int)filterWord.length()))
803             filterWord = filterWord.substr(0, position);
804 
805         position = filterWord.find_last_not_of(" \t\n\r");
806         // first whitespace is at next character position
807         position += 1;
808 
809         // trim trailing whitespace
810         if ((position >=0) && (position < (int)filterWord.length()))
811             filterWord = filterWord.substr(0, position);
812 
813         /* make sure the word isn't empty (e.g. comment lines) */
814         if (filterWord.length() == 0)
815             continue;
816 
817         /*
818           std::cout << "[[[" <<  filterWord << "]]]" << std::endl;
819         */
820 
821         if (verbose)
822         {
823             static int counter=0;
824             if (counter-- <= 0)
825             {
826                 std::cout << ".";
827                 std::cout.flush();
828                 counter=100;
829             }
830         }
831 
832         // convert the word to lowercase
833         std::transform (filterWord.begin(),filterWord.end(), filterWord.begin(), tolower);
834 
835         bool added = addToFilter(filterWord, std::string(""));
836         if ((!added) && (verbose))
837             std::cout << std::endl << "Word is already added: " << filterWord << std::endl;
838         else
839             totalAdded++;
840 
841     } // end iteration over input file
842     if (verbose)
843         std::cout << std::endl;
844 
845     return totalAdded;
846 } // end loadFromFile
847 
848 /** filters an input message either a complex regular expression-based
849  * pattern match (default) catching hundreds of variations per filter
850  * word or using a simple exact word match technique (original).
851  */
filter(char * input,const bool simple) const852 bool WordFilter::filter(char *input, const bool simple) const
853 {
854 #ifdef DEBUG
855     TimeKeeper before = TimeKeeper::getCurrent();
856 #endif
857     bool filtered;
858     if (simple)
859         filtered = simpleFilter(input);
860     else
861         filtered = aggressiveFilter(input);
862 #ifdef DEBUG
863     TimeKeeper after = TimeKeeper::getCurrent();
864     std::cout << "Time elapsed: " << after - before << " seconds" << std::endl;
865 #endif
866     return filtered;
867 }
868 
filter(std::string & input,const bool simple) const869 bool WordFilter::filter(std::string &input, const bool simple) const
870 {
871     char input2[512];
872     bool filtered = false;
873     std::string resultString = "";
874 
875     /* filter in 512 chunks.  ugly means to support large input strings,
876      * but it works.  just means words that span the boundary might be
877      * wrong.
878      */
879     for (unsigned int i = 0; i < input.size(); i+=511)
880     {
881         strncpy(input2, input.c_str() + i, 511);
882         input2[511] = '\0';
883         bool filteredChunk = filter(input2, simple);
884         if (filteredChunk)
885             filtered = true;
886         resultString += input2;
887     }
888     if (filtered)
889         input = resultString;
890     return filtered;
891 }
892 
outputFilter(void) const893 void WordFilter::outputFilter(void) const
894 {
895     for (int i=0; i < MAX_FILTER_SETS; ++i)
896     {
897         int count=0;
898         for (ExpCompareSet::const_iterator j = filters[i].begin(); \
899                 j != filters[i].end(); \
900                 ++j)
901         {
902             std::string jword = j->word;
903             std::cout << count++ << ": " << jword << std::endl;
904             std::cout << "    " << expressionFromString(jword) << std::endl;
905         }
906     }
907 
908 }
outputWords(void) const909 void WordFilter::outputWords(void) const
910 {
911     //        std::cout << "size of compiled set is " << () << std::endl;
912     for (int i=0; i < MAX_FILTER_SETS; ++i)
913     {
914         int count=0;
915         for (ExpCompareSet::const_iterator j = filters[i].begin(); \
916                 j != filters[i].end(); \
917                 ++j)
918             std::cout << "[" << i << "] " << count++ << ": " << j->word << std::endl;
919     }
920 
921 }
wordCount(void) const922 unsigned long int WordFilter::wordCount(void) const
923 {
924     int count=0;
925     for (int i=0; i < MAX_FILTER_SETS; ++i)
926     {
927         for (ExpCompareSet::const_iterator j = filters[i].begin(); \
928                 j != filters[i].end(); \
929                 ++j)
930             count += 1;
931     }
932     return count;
933 }
934 
clear(void)935 void WordFilter::clear(void)
936 {
937     for (int i = 0; i < MAX_FILTER_SETS; i++)
938         filters[i].clear();
939 
940     suffixes.clear();
941     prefixes.clear();
942 }
943 
944 // Local Variables: ***
945 // mode: C++ ***
946 // tab-width: 4 ***
947 // c-basic-offset: 4 ***
948 // indent-tabs-mode: nil ***
949 // End: ***
950 // ex: shiftwidth=4 tabstop=4
951