1 /**
2  * This file is part of the Detox package.
3  *
4  * Copyright (c) Doug Harple <detox.dharple@gmail.com>
5  *
6  * For the full copyright and license information, please view the LICENSE
7  * file that was distributed with this source code.
8  */
9 
10 #include "config.h"
11 
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15 #include <ctype.h>
16 #include <errno.h>
17 
18 #include "clean_string.h"
19 
20 /* translation array for ISO8859.1 characters */
21 #include "iso8859_1.h"
22 
23 /* translation array for unicode characters */
24 #include "unicode.h"
25 
26 #include "parse_table.h"
27 #include "table.h"
28 
29 
30 /*
31  * Translates ISO8859.1 characters (Latin-1) into lower ASCII characters.
32  */
clean_iso8859_1_basic(unsigned char * s,void * opts)33 unsigned char *clean_iso8859_1_basic(unsigned char *s, void *opts)
34 {
35 	unsigned char *output, *input_walk, *output_walk, *replace_walk;
36 	int replace_pos;
37 
38 	if (s == NULL) {
39 		return NULL;
40 	}
41 
42 	output = malloc((strlen(s) * ISO8859_1_MAXLEN) + 1);
43 	if (output == NULL) {
44 		fprintf(stderr, "out of memory: %s\n", strerror(errno));
45 		return NULL;
46 	}
47 
48 	input_walk = s;
49 	output_walk = output;
50 
51 	while (*input_walk != '\0') {
52 		if (*input_walk >= ISO8859_1_OFFSET) {
53 			replace_pos = *input_walk - ISO8859_1_OFFSET;
54 			replace_walk = (unsigned char *)&iso8859_1_trans[replace_pos];
55 
56 			while (*replace_walk != '\0') {
57 				*output_walk++ = *replace_walk++;
58 			}
59 			input_walk++;
60 		}
61 		else {
62 			*output_walk++ = *input_walk++;
63 		}
64 	}
65 
66 	*output_walk = '\0';
67 
68 	return output;
69 }
70 
71 /*
72  * Translates ISO8859.1 characters (Latin-1) into lower ASCII characters.
73  */
clean_iso8859_1(unsigned char * s,void * opts)74 unsigned char *clean_iso8859_1(unsigned char *s, void *opts)
75 {
76 	unsigned char *output, *input_walk, *output_walk, *replace_walk;
77 
78 	struct translation_table *table = NULL;
79 	struct clean_string_options *options = NULL;
80 
81 	if (s == NULL) {
82 		return NULL;
83 	}
84 
85 	if (opts == NULL) {
86 		fprintf(stderr, "this shouldn't happen\n");
87 		exit(EXIT_FAILURE);
88 	}
89 
90 	options = (struct clean_string_options *)opts;
91 	table = options->translation_table;
92 
93 	output = malloc((strlen(s) * table->max_data_length) + 1);
94 	if (output == NULL) {
95 		fprintf(stderr, "out of memory: %s\n", strerror(errno));
96 		return NULL;
97 	}
98 
99 	input_walk = s;
100 	output_walk = output;
101 
102 	while (*input_walk != '\0') {
103 		if (*input_walk >= ISO8859_1_OFFSET) {
104 			replace_walk = table_get(table, *input_walk);
105 			if (replace_walk == NULL) {
106 				if (table->default_translation == NULL) {
107 					/*
108 					 * Null translation == leave it alone
109 					 */
110 					*output_walk++ = *input_walk++;
111 					continue;
112 				}
113 				else {
114 					replace_walk = table->default_translation;
115 				}
116 			}
117 
118 			while (*replace_walk != '\0') {
119 				*output_walk++ = *replace_walk++;
120 			}
121 
122 			input_walk++;
123 		}
124 		else {
125 			*output_walk++ = *input_walk++;
126 		}
127 	}
128 
129 	*output_walk = '\0';
130 
131 	return output;
132 }
133 
134 
/*
 * Replaces characters that are awkward in file names, using a fixed set of
 * rules:
 *
 *   Copied unchanged:
 *     anything isalnum() accepts, plus  - # ~ % ^ _ , . + =
 *
 *   Translated:
 *     &  becomes  _and_
 *
 *   Replaced with _:
 *     space ` ! @ $ * \ | : ; " ' < > ? / '\n' '\r' '\t'
 *
 *   Replaced with -:
 *     ( ) [ ] { }
 *
 *   Every other byte is dropped from the output.
 *
 *   s     NUL-terminated input string; NULL yields NULL.
 *   opts  not used by this filter.
 *
 * Returns a newly malloc()ed string, or NULL when the allocation fails (a
 * message is printed to stderr in that case).
 */
unsigned char *clean_safe_basic(unsigned char *s, void *opts)
{
	static const char keep_set[] = "-#~%^_,.+=";
	static const char underscore_set[] = " `!@$*\\|:;\"'<>?/\n\r\t";
	static const char dash_set[] = "()[]{}";
	static const char ampersand_text[] = "_and_";

	const unsigned char *src;
	unsigned char *result, *dst;
	int c;

	(void)opts;

	if (s == NULL) {
		return NULL;
	}

	/* "&" is the worst case: one byte in, five bytes out */
	result = malloc((strlen((char *)s) * 5) + 1);
	if (result == NULL) {
		fprintf(stderr, "out of memory: %s\n", strerror(errno));
		return NULL;
	}

	dst = result;

	for (src = s; *src != '\0'; src++) {
		c = *src;

		if (isalnum(c) || strchr(keep_set, c) != NULL) {
			*dst++ = (unsigned char)c;
		}
		else if (c == '&') {
			memcpy(dst, ampersand_text, sizeof ampersand_text - 1);
			dst += sizeof ampersand_text - 1;
		}
		else if (strchr(underscore_set, c) != NULL) {
			*dst++ = '_';
		}
		else if (strchr(dash_set, c) != NULL) {
			*dst++ = '-';
		}
		/* no rule matched: the byte is left out of the result */
	}

	*dst = '\0';

	return result;
}
236 
237 
238 /*
239  * Translates unsafe characters
240  */
clean_safe(unsigned char * s,void * opts)241 unsigned char *clean_safe(unsigned char *s, void *opts)
242 {
243 	unsigned char *output, *input_walk, *output_walk, *replace_walk;
244 
245 	struct translation_table *table = NULL;
246 	struct clean_string_options *options = NULL;
247 
248 	if (s == NULL) {
249 		return NULL;
250 	}
251 
252 	if (opts == NULL) {
253 		fprintf(stderr, "this shouldn't happen\n");
254 		exit(EXIT_FAILURE);
255 	}
256 
257 	options = (struct clean_string_options *)opts;
258 	table = options->translation_table;
259 
260 	output = malloc((strlen(s) * table->max_data_length) + 1);
261 	if (output == NULL) {
262 		fprintf(stderr, "out of memory: %s\n", strerror(errno));
263 		return NULL;
264 	}
265 
266 	input_walk = s;
267 	output_walk = output;
268 
269 	while (*input_walk != '\0') {
270 		replace_walk = table_get(table, *input_walk);
271 		if (replace_walk == NULL) {
272 			if (table->default_translation == NULL) {
273 
274 				/*
275 				 * Null translation == leave it alone
276 				 */
277 				*output_walk++ = *input_walk++;
278 				continue;
279 			}
280 			else {
281 				replace_walk = table->default_translation;
282 			}
283 		}
284 
285 		while (*replace_walk != '\0') {
286 			*output_walk++ = *replace_walk++;
287 		}
288 
289 		input_walk++;
290 	}
291 
292 	*output_walk = '\0';
293 
294 	return output;
295 }
296 
297 
298 
/*
 * Cleans up any CGI encoded characters, in the form "%" followed by 2 hex
 * digits, by replacing each such triplet with the byte it encodes.
 *
 * "%00" is deliberately NOT decoded: the decoded byte would be a NUL, which
 * would terminate the result early and silently discard the rest of the
 * name.  It is copied through as the literal text "%00" instead.
 *
 * A "%" that is not followed by two hex digits is copied unchanged.
 *
 *   s     NUL-terminated input string; NULL yields NULL.
 *   opts  not used by this filter.
 *
 * Returns a newly malloc()ed string, or NULL when the allocation fails (a
 * message is printed to stderr in that case).
 */
unsigned char *clean_uncgi(unsigned char *s, void *opts)
{
	unsigned char *output, *input_walk, *output_walk;
	char conv[3];
	long decoded;

	(void)opts;

	if (s == NULL) {
		return NULL;
	}

	/* decoding only ever shrinks the string, so strlen(s) is enough */
	output = malloc(strlen((char *)s) + 1);
	if (output == NULL) {
		fprintf(stderr, "out of memory: %s\n", strerror(errno));
		return NULL;
	}

	input_walk = s;
	output_walk = output;

	while (*input_walk != '\0') {
		/*
		 * The && chain stops at the first failing test, so
		 * input_walk[2] is never read when input_walk[1] is the
		 * terminating NUL.
		 */
		if (input_walk[0] == '%' && isxdigit(input_walk[1]) && isxdigit(input_walk[2])) {
			conv[0] = (char)input_walk[1];
			conv[1] = (char)input_walk[2];
			conv[2] = '\0';
			decoded = strtol(conv, NULL, 16);

			if (decoded != 0) {
				*output_walk++ = (unsigned char)decoded;
				input_walk += 3;
				continue;
			}

			/* "%00": fall through and copy the '%' literally */
		}

		*output_walk++ = *input_walk++;
	}

	*output_walk = '\0';

	return output;
}
338 
339 
340 /*
341  * Reduces any series of "_" and "-" to a single character.  "-" takes
342  * precedence.
343  *
344  * If "remove_trailing" is set to non-zero, then "." is added to the
345  * comparison, and takes precedence.  This has the effect of reducing "-." or
346  * "._", etc, to ".".
347  *
348  * Strips any "-", "_" or "#" from the beginning of a string.
349  *
350  */
clean_wipeup(unsigned char * s,void * opts)351 unsigned char *clean_wipeup(unsigned char *s, void *opts)
352 {
353 	unsigned char *output, *input_walk, *output_walk;
354 	int matched;
355 	int remove_trailing;
356 
357 	if (s == NULL) {
358 		return NULL;
359 	}
360 
361 	remove_trailing = 0;
362 	if (opts != NULL) {
363 		remove_trailing = ((struct clean_string_options *)opts)->remove_trailing;
364 	}
365 
366 	/* remove any -, _, or # at beginning of string */
367 	while (*s == '-' || *s == '_' || *s == '#') {
368 		s++;
369 	}
370 
371 	output = malloc(strlen(s) + 1);
372 	if (output == NULL) {
373 		fprintf(stderr, "out of memory: %s\n", strerror(errno));
374 		return NULL;
375 	}
376 
377 	input_walk = s;
378 	output_walk = output;
379 	matched = 0;
380 
381 	while (*input_walk != '\0') {
382 		switch (*input_walk) {
383 			case '-':
384 				if (matched) {
385 					if (*output_walk == '_') {
386 						*output_walk = '-';
387 					}
388 				}
389 				else {
390 					*output_walk = '-';
391 				}
392 
393 				matched = 1;
394 				break;
395 
396 			case '_':
397 				if (!matched) {
398 					*output_walk = '_';
399 				}
400 
401 				matched = 1;
402 				break;
403 
404 			case '.':
405 				if (remove_trailing) {
406 					*output_walk = '.';
407 					matched = 1;
408 					break;
409 				}	/* else fall through */
410 			default:
411 				if (matched) {
412 					output_walk++;
413 					matched = 0;
414 				}
415 
416 				*output_walk++ = *input_walk;
417 		}
418 		input_walk++;
419 	}
420 
421 	if (matched) {
422 		output_walk++;
423 	}
424 
425 	*output_walk = '\0';
426 
427 	return output;
428 }
429 
430 #define UTF_8_ENCODED 0x80
431 #define UTF_8_ENCODED_4_CHARS 0xf0
432 #define UTF_8_ENCODED_3_CHARS 0xe0
433 #define UTF_8_ENCODED_2_CHARS 0xc0
434 
435 /*
436  * Translates UTF-8 characters (Unicode Translation Format - 8 Bit) into
437  * Unicode and then lower ASCII characters.
438  */
clean_utf_8_basic(unsigned char * s,void * opts)439 unsigned char *clean_utf_8_basic(unsigned char *s, void *opts)
440 {
441 	unsigned char *output, *input_walk, *output_walk, *replace_walk;
442 	int new_value, expected_chars;
443 
444 	if (s == NULL) {
445 		return NULL;
446 	}
447 
448 	output = malloc((strlen(s) * UNICODE_MAXLEN) + 1);
449 	if (output == NULL) {
450 		fprintf(stderr, "out of memory: %s\n", strerror(errno));
451 		return NULL;
452 	}
453 
454 	input_walk = s;
455 	output_walk = output;
456 
457 	while (*input_walk != '\0') {
458 		if ((*input_walk & UTF_8_ENCODED) == 0) {
459 			*output_walk++ = *input_walk++;
460 			continue;
461 		}
462 
463 		new_value = 0;
464 		expected_chars = 0;
465 
466 		/*
467 		 * Needs to be done in descending orders due to the fact that
468 		 * the 2 char mask will match on the 4 char mask, but not
469 		 * vice versa.
470 		 */
471 		if ((*input_walk & UTF_8_ENCODED_4_CHARS) == UTF_8_ENCODED_4_CHARS) {
472 
473 			/*
474 			 * 11110aaa 10bbbbbb 10cccccc 10dddddd
475 			 */
476 
477 			new_value = *input_walk & 0x07;
478 			expected_chars = 3;
479 		}
480 		else if ((*input_walk & UTF_8_ENCODED_3_CHARS) == UTF_8_ENCODED_3_CHARS) {
481 
482 			/*
483 			 * 1110aaaa 10bbbbbb 10cccccc
484 			 */
485 
486 			new_value = *input_walk & 0x0f;
487 			expected_chars = 2;
488 		}
489 		else if ((*input_walk & UTF_8_ENCODED_2_CHARS) == UTF_8_ENCODED_2_CHARS) {
490 
491 			/*
492 			 * 110aaaaa 10bbbbbb
493 			 */
494 
495 			new_value = *input_walk & 0x1f;
496 			expected_chars = 1;
497 		}
498 		else {
499 			input_walk++;
500 			continue;
501 		}
502 
503 		while (expected_chars > 0) {
504 			new_value <<= 6;
505 
506 			input_walk++;
507 
508 			if (*input_walk == '\0') {
509 				new_value = -1;
510 				break;
511 			}
512 
513 			if ((*input_walk & UTF_8_ENCODED) == 0) {
514 				new_value = -1;
515 				break;
516 			}
517 
518 			new_value += *input_walk & 0x3f;
519 
520 			expected_chars--;
521 		}
522 
523 		if (new_value == -1) {
524 			continue;
525 		}
526 
527 		if (new_value >= UNICODE_COUNT) {
528 			*output_walk++ = '_';
529 			continue;
530 		}
531 
532 		replace_walk = (unsigned char *)&unicode_trans[new_value];
533 
534 		while (*replace_walk != '\0') {
535 			*output_walk++ = *replace_walk++;
536 		}
537 	}
538 
539 	*output_walk = '\0';
540 
541 	return output;
542 }
543 
544 /*
545  * Translates UTF-8 characters (Unicode Translation Format - 8 Bit) into
546  * Unicode and then runs the translation table.
547  */
clean_utf_8(unsigned char * s,void * opts)548 unsigned char *clean_utf_8(unsigned char *s, void *opts)
549 {
550 	unsigned char *output, *input_walk, *output_walk, *replace_walk;
551 	int new_value, expected_chars;
552 
553 	struct translation_table *table = NULL;
554 	struct clean_string_options *options = NULL;
555 
556 	int characters_eaten;
557 
558 	if (s == NULL) {
559 		return NULL;
560 	}
561 
562 	if (opts == NULL) {
563 		fprintf(stderr, "this shouldn't happen\n");
564 		exit(EXIT_FAILURE);
565 	}
566 
567 	options = (struct clean_string_options *)opts;
568 	table = options->translation_table;
569 
570 	output = malloc((strlen(s) * table->max_data_length) + 1);
571 	if (output == NULL) {
572 		fprintf(stderr, "out of memory: %s\n", strerror(errno));
573 		return NULL;
574 	}
575 
576 	input_walk = s;
577 	output_walk = output;
578 
579 	while (*input_walk != '\0') {
580 		new_value = 0;
581 		expected_chars = 0;
582 		characters_eaten = 0;
583 
584 		/*
585 		 * Needs to be done in descending orders due to the fact that
586 		 * the 2 char mask will match on the 4 char mask, but not
587 		 * vice versa.
588 		 */
589 		if ((*input_walk & UTF_8_ENCODED_4_CHARS) == UTF_8_ENCODED_4_CHARS) {
590 
591 			/*
592 			 * 11110aaa 10bbbbbb 10cccccc 10dddddd
593 			 */
594 
595 			new_value = *input_walk & 0x07;
596 			expected_chars = 3;
597 			characters_eaten = 4;
598 		}
599 		else if ((*input_walk & UTF_8_ENCODED_3_CHARS) == UTF_8_ENCODED_3_CHARS) {
600 
601 			/*
602 			 * 1110aaaa 10bbbbbb 10cccccc
603 			 */
604 
605 			new_value = *input_walk & 0x0f;
606 			expected_chars = 2;
607 			characters_eaten = 3;
608 		}
609 		else if ((*input_walk & UTF_8_ENCODED_2_CHARS) == UTF_8_ENCODED_2_CHARS) {
610 
611 			/*
612 			 * 110aaaaa 10bbbbbb
613 			 */
614 
615 			new_value = *input_walk & 0x1f;
616 			expected_chars = 1;
617 			characters_eaten = 2;
618 		}
619 		else if ((*input_walk & UTF_8_ENCODED) == UTF_8_ENCODED) {
620 			fprintf(stderr, "unsupported unicode length\n");
621 			exit(EXIT_FAILURE);
622 		}
623 		else {
624 			new_value = *input_walk;
625 			expected_chars = 0;
626 			characters_eaten = 1;
627 		}
628 
629 		while (expected_chars > 0) {
630 			new_value <<= 6;
631 
632 			input_walk++;
633 
634 			if (*input_walk == '\0') {
635 				new_value = -1;
636 				break;
637 			}
638 
639 			if ((*input_walk & UTF_8_ENCODED) == 0) {
640 				new_value = -1;
641 				break;
642 			}
643 
644 			new_value += *input_walk & 0x3f;
645 
646 			expected_chars--;
647 		}
648 		input_walk++;
649 
650 		if (new_value == -1) {
651 			continue;
652 		}
653 
654 		replace_walk = table_get(table, new_value);
655 
656 		if (replace_walk == NULL) {
657 			replace_walk = table->default_translation;
658 		}
659 
660 		if (replace_walk == NULL) {
661 
662 			/*
663 			 * Null translation == leave it alone
664 			 */
665 			input_walk -= characters_eaten;
666 
667 			while (characters_eaten > 0) {
668 				*output_walk++ = *input_walk++;
669 				characters_eaten--;
670 			}
671 
672 			continue;
673 		}
674 
675 		while (*replace_walk != '\0') {
676 			*output_walk++ = *replace_walk++;
677 		}
678 	}
679 
680 	*output_walk = '\0';
681 
682 	return output;
683 }
684 
685 
686 
687 /*
688  * Trims a file down to specified length.
689  */
clean_max_length(unsigned char * s,void * opts)690 unsigned char *clean_max_length(unsigned char *s, void *opts)
691 {
692 	unsigned char *output, *input_walk, *output_walk;
693 	size_t max_length;
694 	size_t s_length;
695 	size_t ext_length;
696 
697 	if (s == NULL) {
698 		return NULL;
699 	}
700 
701 	max_length = 256;
702 	if (opts != NULL) {
703 		max_length = ((struct clean_string_options *)opts)->max_length;
704 	}
705 
706 	s_length = strlen(s);
707 
708 	output = malloc(max_length + 1);
709 	if (output == NULL) {
710 		fprintf(stderr, "out of memory: %s\n", strerror(errno));
711 		return NULL;
712 	}
713 
714 	snprintf(output, max_length + 1, "%s", s);
715 
716 	if (s_length <= max_length) {
717 		return output;
718 	}
719 
720 	input_walk = strrchr(s, '.');
721 
722 	if (input_walk == NULL) {
723 		return output;
724 	}
725 
726 	ext_length = strlen(input_walk);
727 
728 	output_walk = output;
729 	output_walk += max_length - ext_length;
730 
731 	while (*(output_walk - 1) == '.' && output_walk > output) {
732 		output_walk--;
733 	}
734 
735 	snprintf(output_walk, ext_length + 1, "%s", input_walk);
736 
737 	return output;
738 }
739 
740 
/*
 * Produces a lowercase copy of a string: every byte for which isupper() is
 * true is passed through tolower(), all other bytes are copied unchanged.
 *
 *   s     NUL-terminated input string; NULL yields NULL.
 *   opts  not used by this filter.
 *
 * Returns a newly malloc()ed string, or NULL when the allocation fails (a
 * message is printed to stderr in that case).
 */
unsigned char *clean_lower(unsigned char *s, void *opts)
{
	unsigned char *lowered;
	size_t length, i;

	(void)opts;

	if (s == NULL) {
		return NULL;
	}

	length = strlen((char *)s);

	lowered = malloc(length + 1);
	if (lowered == NULL) {
		fprintf(stderr, "out of memory: %s\n", strerror(errno));
		return NULL;
	}

	for (i = 0; i < length; i++) {
		lowered[i] = isupper(s[i]) ? (unsigned char)tolower(s[i]) : s[i];
	}

	lowered[length] = '\0';

	return lowered;
}
774