1 /* bzflag
2 * Copyright (c) 1993-2021 Tim Riker
3 *
4 * This package is free software; you can redistribute it and/or
5 * modify it under the terms of the license found in the file
6 * named COPYING that should have accompanied this file.
7 *
8 * THIS PACKAGE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
9 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
10 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
11 */
12
// this class's interface
14 #include "WordFilter.h"
15
16 // system headers
17 #include <ctype.h>
18 #include <string>
19
20 // implementation-specific headers
21 #ifdef DEBUG
22 # include "TimeKeeper.h"
23 #endif
24
25
26 /* private */
27
28 /* protected */
29
simpleFilter(char * input) const30 bool WordFilter::simpleFilter(char *input) const
31 {
32 bool filtered = false;
33 filter_t findWord;
34
35 std::string line = input;
36 int startPosition = line.find_first_of(alphabet);
37
38 int endPosition;
39 std::string word;
40 unsigned int firstchar;
41
42 /* here we iterate over all of the words in the input and replace
43 * exact matches with asterisks
44 */
45 while (startPosition >= 0)
46 {
47 endPosition = line.find_first_not_of(alphabet, startPosition+1);
48 if (endPosition < 0)
49 endPosition = line.length();
50
51 word = line.substr(startPosition, endPosition-startPosition);
52 // transform to lowercase
53 std::transform (word.begin(), word.end(), word.begin(), tolower);
54
55 findWord.word = word;
56 firstchar = (unsigned char)word[0];
57 if (filters[firstchar].find(findWord) != \
58 filters[firstchar].end())
59 {
60
61 /* fill with asterisks */
62 // memset(input+startPosition,'*', endPosition-startPosition);
63
64 /* fill with random filter chars */
65 if (filterCharacters(input, startPosition, endPosition-startPosition, true) > 0)
66 filtered=true;
67 }
68 startPosition = line.find_first_of(alphabet, endPosition);
69 }
70 return filtered;
71 } // end simpleFilter
72
73
aggressiveFilter(char * input) const74 bool WordFilter::aggressiveFilter(char *input) const
75 {
76 #if !defined(HAVE_REGEX_H)
77
78 std::cerr << "Regular expressions are not available (using the simple filter)" << std::endl;
79 return simpleFilter(input);
80
81 #else /* HAVE_REGEX_H */
82
83 bool filtered = false;
84 regmatch_t match[1];
85 if (input == NULL) return false;
86 int inputLength = strlen(input);
87
88 // a buffer to destroy during matching (includes terminating null)
89 std::string sInput = input;
90
91 /* maintain an array of match indices of the input; values are in
92 * pairs. the first number is the start position, the second is
93 * a length. array has initial size of 256 (128 words).
94 */
95 std::vector<int> matchPair(MAX_FILTER_SETS);
96
97 // how many match pairs are stored
98 unsigned int matchCount = 0;
99
100 // stores which letters have been used already
101 std::string wordIndices;
102
103 /* here, we get a list of characters that might be the start of a word.
104 * characters that are preceded by a non-alphabetic character are added
105 * to a list of letters. e.g. the first line of this comment consists
106 * of the letters: hwgaloctmbs
107 * This means that we will only search through filter words that begin
108 * with those letters.
109 */
110 char previousChar = 0;
111 for (int counter = 0; counter < inputLength; counter++)
112 {
113 char c = tolower(sInput[counter]);
114
115 if (!isalpha(previousChar) && TextUtils::isVisible(c))
116 {
117
118 // expand punctuation to potential alphabetic characters
119 if (TextUtils::isPunctuation(c))
120 {
121
122 std::string puncChars = alphabeticSetFromCharacter(c);
123 for (unsigned int cnt = 0; cnt < puncChars.size(); cnt++)
124 appendUniqueChar(wordIndices, tolower(puncChars[cnt]));
125 }
126
127 // add the character itself if not previously added
128 appendUniqueChar(wordIndices, c);
129 }
130 previousChar = c;
131 }
132
133 /* check the input for a words that begin with a prefix. if it does,
134 * then add the next letter as a letter to check for during matching.
135 * e.g. "bzstring" will match a prefix of "bz" and make "s" get added
136 * as a bin to check for during matching.
137 */
138 for (ExpCompareSet::const_iterator i = prefixes.begin(); i != prefixes.end(); ++i)
139 {
140 if (regexec(i->compiled, sInput.c_str(), 1, match, 0) == 0)
141 {
142 if ( (match[0].rm_eo < inputLength) && isalpha(sInput[match[0].rm_eo]) )
143 {
144 /* do not forget to make sure this is a true prefix */
145 if ( (match[0].rm_so > 0) && isalpha(sInput[match[0].rm_so - 1]) )
146 continue;
147
148 /* we found a prefix -- add the letter that follows */
149 appendUniqueChar(wordIndices, tolower(sInput[match[0].rm_eo]));
150 }
151 }
152 }
153
154
155 //std::cout << "WordIndexLetters are [" << wordIndices << "]" << std::endl;
156 // now we have a record of all potential word boundary positions
157
158
159 /* iterate over the filter words for each unique initial word character */
160 int regCode;
161 for (unsigned int j = 0; j < wordIndices.size(); j++)
162 {
163
164 /* look at all of the filters that start with the letter wordIndices[j]
165 */
166 const unsigned int firstchar = (unsigned char)wordIndices[j];
167 for (ExpCompareSet::const_iterator i = filters[firstchar].begin();
168 i != filters[firstchar].end(); ++i)
169 {
170
171 /* the big kahuna burger processing goes on here */
172 bool matched = true;
173 while (matched)
174 {
175 matched = false;
176
177 regCode = regexec(i->compiled, sInput.c_str(), 1, match, 0);
178
179 //std::cout << "input is [" << sInput << "]" << std::endl;
180
181 if ( regCode == 0 )
182 {
183 int startOffset = match[0].rm_so;
184 int endOffset = match[0].rm_eo;
185
186 //std::cout << "We matched ... ";
187
188 /* make sure we only match on word boundaries */
189 if ( (startOffset>1) && (isalpha(sInput[startOffset-1])) )
190 {
191
192 //std::cout << "but didn't match a word beginning" << std::endl;
193
194 /* we are in the middle of a word.. see if we can match a prefix before this */
195 bool foundit = false;
196 for (ExpCompareSet::const_iterator k = prefixes.begin();
197 k != prefixes.end(); ++k)
198 {
199 if (regexec(k->compiled, sInput.c_str(), 1, match, 0) == 0)
200 {
201
202 //std::cout << "checking prefix: " << k->word << std::endl;
203
204 if ( (match[0].rm_so > 1) && (isalpha(sInput[match[0].rm_so - 1])) )
205 {
206 /* we matched, but we are still in the middle of a word */
207 continue;
208 }
209
210 //std::cout << "matched a prefix! " << k->word << std::endl;
211 if (match[0].rm_eo == startOffset)
212 {
213 /* perfect prefix match */
214 startOffset = match[0].rm_so;
215 foundit = true;
216 break;
217 }
218 }
219 }
220 if (!foundit)
221 {
222 /* couldn't find a prefix, so skip this match */
223 //std::cout << "Could not find a prefix" <<std::endl;
224 continue;
225 }
226 }
227
228 //std::cout << "is endoffset alphabetic: " << input[endOffset] << std::endl;
229
230 if ( (endOffset<inputLength-1) && (isalpha(sInput[endOffset])) )
231 {
232
233 //std::cout << "but didn't match a word ending" << std::endl;
234
235 /* we are at the start of a word, but not at the end, try to get to the end */
236 bool foundit = false;
237 for (ExpCompareSet::const_iterator k = suffixes.begin();
238 k != suffixes.end(); ++k)
239 {
240 //std::cout << "checking " << k->word << " against [" << input + endOffset << "]" << std::endl;
241
242 if (regexec(k->compiled, sInput.c_str() + endOffset, 1, match, 0) == 0)
243 {
244
245 //std::cout << "is " << match[0].rm_eo << " less than " << inputLength - endOffset << std::endl;
246 //std::cout << "is alpha =?= " << input[endOffset + match[0].rm_eo + 1] << std::endl;
247
248 /* again, make sure we are now at a word end */
249 if ( (match[0].rm_eo < inputLength - endOffset) &&
250 (isalpha(sInput[endOffset + match[0].rm_eo])) )
251 {
252 /* we matched, but we are still in the middle of a word */
253 continue;
254 }
255
256 //std::cout << "matched a suffix! " << k->word << std::endl;
257 if (match[0].rm_so == 0)
258 {
259 /* push the end forward a little since we matched */
260 endOffset += match[0].rm_eo;
261 foundit = true;
262 break;
263 }
264 }
265 }
266 if (!foundit)
267 {
268 /* couldn't find a suffix, so skip this match */
269 //std::cout << "Could not find a suffix" <<std::endl;
270 continue;
271 }
272 }
273
274 int matchLength = endOffset - startOffset;
275
276 // make sure that longer matches actually include at least 1 alphabetic
277 if (matchLength > 3)
278 {
279 bool foundAlpha = false;
280 for (std::string::const_iterator position = sInput.begin(); position != sInput.end(); position++)
281 {
282 if (isalpha(*position))
283 {
284 foundAlpha = true;
285 break;
286 }
287 }
288 if (!foundAlpha)
289 continue;
290 }
291
292 // add a few more slots if necessary (this should be rare/never)
293 if (matchCount * 2 + 1 >= matchPair.size())
294 matchPair.resize(matchCount * 2 + 201);
295
296 matchPair[matchCount * 2] = startOffset; /* position */
297 matchPair[(matchCount * 2) + 1] = matchLength; /* length */
298 matchCount++;
299 filtered = true;
300 matched = true;
301 // zappo! .. erase stuff that has been filtered to speed up future checks
302 // fill with some non-whitespace alpha that is not the start/end of a suffix to prevent rematch
303 std::string filler;
304 filler.assign(matchLength, 'W');
305 sInput.replace(startOffset, matchLength, filler);
306
307 }
308 else if ( regCode == REG_NOMATCH )
309 {
310 // do nothing
311 continue;
312
313 }
314 else
315 {
316 char errorBuffer[256];
317 regerror(regCode, i->compiled, errorBuffer, 256);
318 std::cout << errorBuffer << std::endl;
319
320 } /* end if regcode */
321
322 } /* end regexec-ing */
323
324 } /* iterate over words in a particular character bin */
325
326 } /* iterate over characters */
327
328 /* finally filter the input. only filter actual alphanumerics. */
329 for (unsigned int l = 0; l < matchCount; l++)
330 {
331 /* !!! debug */
332 #ifdef DEBUG
333 char tmp[256] = {0};
334 strncpy(tmp, input + matchPair[l*2], matchPair[(l*2)+1]);
335 std::cout << "Matched: [" << tmp << "]" << std::endl;
336 #endif
337
338 if (filterCharacters(input, matchPair[l*2], matchPair[(l*2)+1]) <= 0)
339 {
340 // XXX with multiple matching, we will be unable to filter overlapping matches
341 // std::cerr << "Unable to filter characters" << std::endl;
342 continue;
343 }
344 }
345
346
347 return filtered;
348
349 #endif /* HAVE_REGEX_H */
350 } // end aggressiveFilter
351
352
353 // provides a pointer to a fresh compiled expression for some given expression
354 #if !defined(HAVE_REGEX_H)
getCompiledExpression(const std::string &) const355 regex_t *WordFilter::getCompiledExpression(const std::string &) const
356 {
357 return (regex_t *)NULL;
358 }
359 #else /* HAVE_REGEX_H */
getCompiledExpression(const std::string & word) const360 regex_t *WordFilter::getCompiledExpression(const std::string &word) const
361 {
362 regex_t *compiledReg;
363
364 /* XXX need to convert this to use new/delete */
365 if ( (compiledReg = (regex_t *)calloc(1, sizeof(regex_t))) == NULL )
366 {
367
368 perror("calloc failed");
369 std::cerr << "Warning: unable to allocate memory for compiled regular expression";
370 return (regex_t *)NULL;
371
372 }
373
374 if ( regcomp(compiledReg, word.c_str(), REG_EXTENDED | REG_ICASE) != 0 )
375 {
376 std::cerr << "Warning: unable to compile regular expression for [" << word << "]" << std::endl;
377 free(compiledReg);
378 return (regex_t *)NULL;
379 }
380 return compiledReg;
381
382 }
383 #endif /* HAVE_REGEX_H */
384
385
l33tspeakSetFromCharacter(const char c) const386 std::string WordFilter::l33tspeakSetFromCharacter(const char c) const
387 {
388 std::string set = "";
389
390 if (!isalnum(c))
391 {
392 /* escape the non-alphanumeric (punctuation or control chars) */
393 set = " ";
394 set[0] = '\\';
395 set[1] = c;
396 return set;
397 }
398 else if (isspace(c))
399 {
400 set = " ";
401 set[0] = c;
402 return set;
403 }
404
405 switch (c)
406 {
407 case 'a':
408 set = "a4@";
409 break;
410 case 'b':
411 set = "b8";
412 break;
413 case 'c':
414 set = "c\\(";
415 break;
416 case 'e':
417 set = "e3";
418 break;
419 case 'g':
420 set = "g96";
421 break;
422 case 'i':
423 set = "il1|!\\/";
424 break;
425 case 'l':
426 set = "li1!|\\/";
427 break;
428 case 'o':
429 set ="o0";
430 break;
431 case 's':
432 // dollarsign $ may not be the first char..
433 set = "s$z5";
434 break;
435 case 't':
436 set = "t+7";
437 break;
438 case 'v':
439 set = "v\\/";
440 break;
441 case 'w':
442 set = "w\\/";
443 break;
444 case 'z':
445 set = "zs";
446 break;
447 default:
448 set = " ";
449 set[0] = c;
450 break;
451 }
452
453 return set;
454 }
455
456
alphabeticSetFromCharacter(const char c) const457 std::string WordFilter::alphabeticSetFromCharacter(const char c) const
458 {
459 std::string set = " ";
460
461 /* for most punctuation, we include the actual punctuation
462 * last just in case it was really intended
463 */
464 switch (c)
465 {
466 case '!':
467 set = "il";
468 break;
469 case '@':
470 set = "a";
471 break;
472 case '$':
473 set = "s";
474 break;
475 case '&':
476 set = "s";
477 break;
478 case '(':
479 set = "cil";
480 break;
481 case ')':
482 set = "il";
483 break;
484 case '+':
485 set = "t";
486 break;
487 case '|':
488 set = "li";
489 break;
490 case '\\':
491 set = "li";
492 break;
493 case '{':
494 set = "c";
495 break;
496 case '/':
497 set = "il";
498 break;
499 case '*':
500 set = "aeiou";
501 break;
502 default:
503 set = " ";
504 set[0] = c;
505 break;
506 }
507
508 return set;
509 }
510
511
512
expressionFromString(const std::string & word) const513 std::string WordFilter::expressionFromString(const std::string &word) const
514 {
515 /* create the regular expression description */
516 std::string expression;
517 unsigned int length = word.length();
518 std::string charSet;
519
520 /* individual characters expand into a potential set of matchable characters */
521 for (unsigned int i = 0; i < length; i++)
522 {
523
524 // convert to lowercase for simplicity and speed
525 charSet = l33tspeakSetFromCharacter(tolower(word[i]));
526
527 /* we specifically will create a regular expression that should at least
528 * match exactly the given input, including any spaces or special
529 * characters. including spaces or other characters in the input will
530 * make them required to create a match.
531 */
532
533 /* append multi-letter expansions */
534 if (charSet[0] == 'f')
535 {
536 /* ensure we don't capture non-printables after end of word */
537 if (i != length - 1)
538 expression.append("[fp]+[^[:alpha:]]*h?[^[:alpha:]]*");
539 else
540 expression.append("[fp]+h?");
541 }
542 else
543 {
544
545 if ( charSet.size() >= 1 )
546 {
547 /* appends characters classes */
548 expression.append("[");
549 expression.append(charSet);
550 expression.append("]");
551 }
552 else if (charSet.size() == 1)
553 {
554 /* append single characters */
555 expression.append(charSet);
556 }
557 else
558 {
559 std::cout << "ERROR: l33t-speak returned an empty string" << std::endl;
560 std::cout << "ERROR: This should never happen" << std::endl;
561 exit(1);
562 }
563
564 /* ensure we don't capture non-printables after end of word. these do
565 * not get appended to the special "f" case.
566 */
567 if (i != length - 1)
568 expression.append("+[^[:alpha:]]*");
569 else
570 expression.append("+");
571
572 } // end test for multi-letter expansions
573
574 } // end iteration over word letters
575
576 // std::cout << "EXP: " << expression << std::endl;
577
578 return expression;
579 }
580
581
582 /* public: */
583
WordFilter()584 WordFilter::WordFilter()
585 {
586 filter_t fix;
587
588 /* set up the alphabet for simple filtering */
589 alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
590
591 /* filter characters randomly used to replace filtered text */
592 filterChars = "!@#$%^&*";
593
594 /* SUFFIXES */
595
596 #if 1
597 // noun
598 addSuffix("dom");
599 addSuffix("ity");
600 addSuffix("ment");
601 addSuffix("sion");
602 addSuffix("tion");
603 addSuffix("ness");
604 addSuffix("ance");
605 addSuffix("ence");
606 addSuffix("er");
607 addSuffix("or");
608 addSuffix("ist");
609 // adjective
610 addSuffix("ive");
611 addSuffix("ic");
612 addSuffix("al");
613 addSuffix("able");
614 addSuffix("y");
615 addSuffix("ous");
616 addSuffix("ful");
617 addSuffix("less");
618 // verb
619 addSuffix("en");
620 addSuffix("ize");
621 addSuffix("ate");
622 addSuffix("ify");
623 addSuffix("fy");
624 addSuffix("ed");
625 // adverb
626 addSuffix("ly");
627 // slang
628 addSuffix("a");
629 addSuffix("e");
630 addSuffix("i");
631 addSuffix("o");
632 addSuffix("u");
633 addSuffix("z");
634 addSuffix("r");
635 addSuffix("ah");
636 addSuffix("io");
637 addSuffix("rs");
638 addSuffix("rz");
639 addSuffix("in");
640 addSuffix("n");
641 addSuffix("ster");
642 addSuffix("meister");
643 // plurality
644 addSuffix("s");
645 addSuffix("es");
646 // imperfect verb
647 addSuffix("ing");
648 // diminutive
649 addSuffix("let");
650
651 #endif
652
653 /* PREFIXES */
654
655 // bz-specific
656
657 #if 1
658 /* XXX adding prefixes _significantly_ increases the expression count
659 * and is rather expensive (slow, XN+N extra checks for N words)
660 */
661 addPrefix("bz"); // bz-specific prefix
662 addPrefix("beze"); // bz-specific prefix
663 addPrefix("u"); // l33t prefix
664 addPrefix("you");
665 addPrefix("ura"); // l33t prefix
666 addPrefix("k"); // common l33t prefix
667
668 #endif
669
670 return;
671 }
672
673 /** default copy constructor */
WordFilter(const WordFilter & _filter)674 WordFilter::WordFilter(const WordFilter& _filter)
675 : alphabet(_filter.alphabet),
676 filterChars(_filter.filterChars),
677 suffixes(_filter.suffixes),
678 prefixes(_filter.prefixes)
679 {
680 for (int i=0; i < MAX_FILTER_SETS; i++)
681 filters[i] = _filter.filters[i];
682 }
683
684
685
686 /** destructor releases the compiled bad words */
~WordFilter(void)687 WordFilter::~WordFilter(void)
688 {
689 ExpCompareSet::iterator i;
690
691 // delete compiled words
692 for (int j = 0; j < MAX_FILTER_SETS; j++)
693 {
694 for (i = filters[j].begin();
695 i != filters[j].end();
696 ++i)
697 {
698 if (i->compiled)
699 {
700 regfree(i->compiled);
701 free(i->compiled);
702 }
703 }
704 }
705 // delete compiled prefixes
706 for (i = prefixes.begin();
707 i != prefixes.end();
708 ++i)
709 {
710 if (i->compiled)
711 {
712 regfree(i->compiled);
713 free(i->compiled);
714 }
715 }
716 // delete compiled suffixes
717 for (i = suffixes.begin();
718 i != suffixes.end();
719 ++i)
720 {
721 if (i->compiled)
722 {
723 regfree(i->compiled);
724 free(i->compiled);
725 }
726 }
727
728 return;
729 }
730
731
732 // adds an individual word to the filter list
addToFilter(const std::string & word,const std::string & expression)733 bool WordFilter::addToFilter(const std::string &word, const std::string &expression)
734 {
735 long int length = (long int)word.length();
736 if (0 >= length)
737 {
738 std::cerr << "Someone tried to add an empty word to the filter" << std::endl;
739 return false;
740 } // end check if word is empty
741
742 if (expression.size() == 0)
743 {
744 /* make sure to create an expression if it wasn't given */
745 std::string expr = expressionFromString(word);
746 return addToFilter(word, expr);
747
748 }
749 else
750 {
751 /* base case */
752 filter_t newFilter;
753
754 newFilter.word = word;
755 newFilter.compiled = getCompiledExpression(expression);
756
757 unsigned int firstchar = (unsigned char)tolower(word[0]);
758 /* check if the word is already added */
759 if (filters[firstchar].find(newFilter) != \
760 filters[firstchar].end())
761 {
762 regfree(newFilter.compiled);
763 free(newFilter.compiled);
764 return false;
765 }
766 else
767 filters[firstchar].insert(newFilter);
768 return true;
769 }
770 } // end addToFilter
771
772
773 /** loads a set of bad words from a specified file */
loadFromFile(const std::string & fileName,bool verbose)774 unsigned int WordFilter::loadFromFile(const std::string &fileName, bool verbose)
775 {
776 char buffer[2048];
777 unsigned int totalAdded=0;
778 std::ifstream filterStream(fileName.c_str());
779
780 if (!filterStream)
781 {
782 if (verbose)
783 std::cerr << "Warning: '" << fileName << "' bad word filter file not found" << std::endl;
784 return 0;
785 }
786
787 while (filterStream.good())
788 {
789 filterStream.getline(buffer,2048);
790
791 std::string filterWord = buffer;
792
793 int position = filterWord.find_first_not_of("\r\n\t ");
794
795 // trim leading whitespace
796 if (position > 0)
797 filterWord = filterWord.substr(position);
798
799 position = filterWord.find_first_of("#\r\n");
800
801 // trim trailing comments
802 if ((position >= 0) && (position < (int)filterWord.length()))
803 filterWord = filterWord.substr(0, position);
804
805 position = filterWord.find_last_not_of(" \t\n\r");
806 // first whitespace is at next character position
807 position += 1;
808
809 // trim trailing whitespace
810 if ((position >=0) && (position < (int)filterWord.length()))
811 filterWord = filterWord.substr(0, position);
812
813 /* make sure the word isn't empty (e.g. comment lines) */
814 if (filterWord.length() == 0)
815 continue;
816
817 /*
818 std::cout << "[[[" << filterWord << "]]]" << std::endl;
819 */
820
821 if (verbose)
822 {
823 static int counter=0;
824 if (counter-- <= 0)
825 {
826 std::cout << ".";
827 std::cout.flush();
828 counter=100;
829 }
830 }
831
832 // convert the word to lowercase
833 std::transform (filterWord.begin(),filterWord.end(), filterWord.begin(), tolower);
834
835 bool added = addToFilter(filterWord, std::string(""));
836 if ((!added) && (verbose))
837 std::cout << std::endl << "Word is already added: " << filterWord << std::endl;
838 else
839 totalAdded++;
840
841 } // end iteration over input file
842 if (verbose)
843 std::cout << std::endl;
844
845 return totalAdded;
846 } // end loadFromFile
847
848 /** filters an input message either a complex regular expression-based
849 * pattern match (default) catching hundreds of variations per filter
850 * word or using a simple exact word match technique (original).
851 */
filter(char * input,const bool simple) const852 bool WordFilter::filter(char *input, const bool simple) const
853 {
854 #ifdef DEBUG
855 TimeKeeper before = TimeKeeper::getCurrent();
856 #endif
857 bool filtered;
858 if (simple)
859 filtered = simpleFilter(input);
860 else
861 filtered = aggressiveFilter(input);
862 #ifdef DEBUG
863 TimeKeeper after = TimeKeeper::getCurrent();
864 std::cout << "Time elapsed: " << after - before << " seconds" << std::endl;
865 #endif
866 return filtered;
867 }
868
filter(std::string & input,const bool simple) const869 bool WordFilter::filter(std::string &input, const bool simple) const
870 {
871 char input2[512];
872 bool filtered = false;
873 std::string resultString = "";
874
875 /* filter in 512 chunks. ugly means to support large input strings,
876 * but it works. just means words that span the boundary might be
877 * wrong.
878 */
879 for (unsigned int i = 0; i < input.size(); i+=511)
880 {
881 strncpy(input2, input.c_str() + i, 511);
882 input2[511] = '\0';
883 bool filteredChunk = filter(input2, simple);
884 if (filteredChunk)
885 filtered = true;
886 resultString += input2;
887 }
888 if (filtered)
889 input = resultString;
890 return filtered;
891 }
892
outputFilter(void) const893 void WordFilter::outputFilter(void) const
894 {
895 for (int i=0; i < MAX_FILTER_SETS; ++i)
896 {
897 int count=0;
898 for (ExpCompareSet::const_iterator j = filters[i].begin(); \
899 j != filters[i].end(); \
900 ++j)
901 {
902 std::string jword = j->word;
903 std::cout << count++ << ": " << jword << std::endl;
904 std::cout << " " << expressionFromString(jword) << std::endl;
905 }
906 }
907
908 }
outputWords(void) const909 void WordFilter::outputWords(void) const
910 {
911 // std::cout << "size of compiled set is " << () << std::endl;
912 for (int i=0; i < MAX_FILTER_SETS; ++i)
913 {
914 int count=0;
915 for (ExpCompareSet::const_iterator j = filters[i].begin(); \
916 j != filters[i].end(); \
917 ++j)
918 std::cout << "[" << i << "] " << count++ << ": " << j->word << std::endl;
919 }
920
921 }
wordCount(void) const922 unsigned long int WordFilter::wordCount(void) const
923 {
924 int count=0;
925 for (int i=0; i < MAX_FILTER_SETS; ++i)
926 {
927 for (ExpCompareSet::const_iterator j = filters[i].begin(); \
928 j != filters[i].end(); \
929 ++j)
930 count += 1;
931 }
932 return count;
933 }
934
clear(void)935 void WordFilter::clear(void)
936 {
937 for (int i = 0; i < MAX_FILTER_SETS; i++)
938 filters[i].clear();
939
940 suffixes.clear();
941 prefixes.clear();
942 }
943
944 // Local Variables: ***
945 // mode: C++ ***
946 // tab-width: 4 ***
947 // c-basic-offset: 4 ***
948 // indent-tabs-mode: nil ***
949 // End: ***
950 // ex: shiftwidth=4 tabstop=4
951