1 /* $Id: decode.c,v 1.395 2011/09/03 13:25:39 sbajic Exp $ */
2 
3 /*
4  DSPAM
5  COPYRIGHT (C) 2002-2012 DSPAM PROJECT
6 
7  This program is free software: you can redistribute it and/or modify
8  it under the terms of the GNU Affero General Public License as
9  published by the Free Software Foundation, either version 3 of the
10  License, or (at your option) any later version.
11 
12  This program is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  GNU Affero General Public License for more details.
16 
17  You should have received a copy of the GNU Affero General Public License
18  along with this program.  If not, see <http://www.gnu.org/licenses/>.
19 
20 */
21 
22 /*
23  * decode.c - message decoding and parsing
24  *
25  *  DESCRIPTION
26  *    This set of functions performs parsing and decoding of a message and
27  *    embeds its components into a ds_message_t structure, suitable for
28  *    logical access.
29  */
30 
31 #ifdef HAVE_CONFIG_H
32 #include <auto-config.h>
33 #endif
34 
35 #include <stdio.h>
36 #include <string.h>
37 #include <stdlib.h>
38 #include <ctype.h>
39 
40 #include "decode.h"
41 #include "error.h"
42 #include "util.h"
43 #include "language.h"
44 #include "buffer.h"
45 #include "base64.h"
46 #include "libdspam.h"
47 
48 /*
49  * _ds_actualize_message (const char *message)
50  *
51  * DESCRIPTION
52  *   primary message parser
53  *
54  *   this function performs all decoding and actualization of the message
55  *   into the message structures defined in the .h
56  *
57  * INPUT ARGUMENTS
58  *      message    message to decode
59  *
60  * RETURN VALUES
61  *   pointer to an allocated message structure (ds_message_t), NULL on failure
62  */
63 
64 ds_message_t
_ds_actualize_message(const char * message)65 _ds_actualize_message (const char *message)
66 {
67   char *line = NULL;
68   char *in = NULL;
69   char *m_in = NULL;
70   ds_message_part_t current_block;
71   ds_header_t current_heading = NULL;
72   struct nt *boundaries = NULL;
73   ds_message_t out = NULL;
74   int block_position = BP_HEADER;
75   int in_content = 0;
76 
77   if (!message || !(*message))
78     goto MEMFAIL;
79 
80   if (!(in = strdup(message)))
81     goto MEMFAIL;
82 
83   m_in = in;
84 
85   boundaries = nt_create (NT_CHAR);
86   if (!boundaries)
87     goto MEMFAIL;
88 
89   out = (ds_message_t) calloc (1, sizeof (struct _ds_message));
90   if (!out)
91     goto MEMFAIL;
92 
93   out->components = nt_create (NT_PTR);
94   if (!out->components)
95     goto MEMFAIL;
96 
97   current_block = _ds_create_message_part ();
98   if (!current_block)
99     goto MEMFAIL;
100 
101   if (nt_add (out->components, (void *) current_block) == NULL)
102     goto MEMFAIL;
103 
104   /* Read the message from memory */
105 
106   line = strsep (&in, "\n");
107   while (line)
108   {
109 
110     /* Header processing */
111 
112     if (block_position == BP_HEADER)
113     {
114 
115       /* If we see two boundaries converged on top of one another */
116 
117       if (_ds_match_boundary (boundaries, line))
118       {
119 
120         /* Add the boundary as the terminating boundary */
121 
122         current_block->terminating_boundary = strdup (line + 2);
123         current_block->original_encoding = current_block->encoding;
124 
125         _ds_decode_headers(current_block);
126         current_block = _ds_create_message_part ();
127 
128         if (!current_block)
129           goto MEMFAIL;
130 
131         if (nt_add (out->components, (void *) current_block) == NULL)
132           goto MEMFAIL;
133 
134         block_position = BP_HEADER;
135       }
136 
137       /* Concatenate multiline headers to the original header field data */
138 
139       else if (line[0] == 32 || line[0] == '\t')
140       {
141         if (current_heading)
142         {
143           char *eow, *ptr;
144 
145           ptr = realloc (current_heading->data,
146                          strlen (current_heading->data) + strlen (line) + 2);
147           if (ptr)
148           {
149             current_heading->data = ptr;
150             strcat (current_heading->data, "\n");
151             strcat (current_heading->data, line);
152           } else {
153             goto MEMFAIL;
154           }
155 
156           /* Our concatenated data doesn't have any whitespace between lines */
157           for(eow=line;eow[0] && isspace((int) eow[0]);eow++) { }
158 
159           ptr =
160             realloc (current_heading->concatenated_data,
161               strlen (current_heading->concatenated_data) + strlen (eow) + 1);
162           if (ptr)
163           {
164             current_heading->concatenated_data = ptr;
165             strcat (current_heading->concatenated_data, eow);
166           } else {
167             goto MEMFAIL;
168           }
169 
170           if (current_heading->original_data) {
171             ptr =
172               realloc (current_heading->original_data,
173                        strlen (current_heading->original_data) +
174                                strlen (line) + 2);
175             if (ptr) {
176               current_heading->original_data = ptr;
177               strcat (current_heading->original_data, "\n");
178               strcat (current_heading->original_data, line);
179             } else {
180               goto MEMFAIL;
181             }
182           }
183 
184           _ds_analyze_header (current_block, current_heading, boundaries);
185         }
186       }
187 
188       /* New header field when LF or CRLF is not found */
189 
190       else if (line[0] != 0  && line[0] != 13)
191       {
192         ds_header_t header = _ds_create_header_field (line);
193 
194         if (header != NULL)
195         {
196           _ds_analyze_header (current_block, header, boundaries);
197           current_heading = header;
198           nt_add (current_block->headers, header);
199         }
200 
201 
202       /* line[0] == 0 or line[0] == 13; LF or CRLF, switch to body */
203 
204       } else {
205         block_position = BP_BODY;
206       }
207     }
208 
209     /* Body processing */
210 
211     else if (block_position == BP_BODY)
212     {
213       /* Look for a boundary in the header of a part */
214 
215       if (!strncasecmp (line, "Content-Type", 12)
216             || ((line[0] == 32 || line[0] == 9) && in_content))
217       {
218         char boundary[128];
219         in_content = 1;
220         if (!_ds_extract_boundary(boundary, sizeof(boundary), line)) {
221           if (!_ds_match_boundary (boundaries, boundary)) {
222             _ds_push_boundary (boundaries, boundary);
223             free(current_block->boundary);
224             current_block->boundary = strdup (boundary);
225           }
226         } else {
227           _ds_push_boundary (boundaries, "");
228         }
229       } else {
230         in_content = 0;
231       }
232 
233       /* Multipart boundary was reached; move onto next block */
234 
235       if (_ds_match_boundary (boundaries, line))
236       {
237 
238         /* Add the boundary as the terminating boundary */
239 
240         current_block->terminating_boundary = strdup (line + 2);
241         current_block->original_encoding = current_block->encoding;
242 
243         _ds_decode_headers(current_block);
244         current_block = _ds_create_message_part ();
245 
246         if (!current_block)
247           goto MEMFAIL;
248 
249         if (nt_add (out->components, (void *) current_block) == NULL)
250           goto MEMFAIL;
251 
252         block_position = BP_HEADER;
253       }
254 
255       /* Plain old message (or part) body */
256 
257       else {
258         buffer_cat (current_block->body, line);
259 
260         /* Don't add extra \n at the end of message's body */
261 
262         if (in != NULL)
263           buffer_cat (current_block->body, "\n");
264       }
265     }
266 
267     line = strsep (&in, "\n");
268   } /* while (line) */
269 
270   _ds_decode_headers(current_block);
271 
272   free (m_in);
273   nt_destroy (boundaries);
274   return out;
275 
276 MEMFAIL:
277   if (m_in) free(m_in);
278   if (boundaries) nt_destroy (boundaries);
279   if (out) _ds_destroy_message(out);
280   LOG (LOG_CRIT, ERR_MEM_ALLOC);
281   return NULL;
282 }
283 
284 /*
285  * _ds_create_message_part
286  *
287  * DESCRIPTION
288  *   create and initialize a new message block component
289  *
290  * RETURN VALUES
291  *   pointer to an allocated message block (ds_message_part_t), NULL on failure
292  *
293  */
294 
295 ds_message_part_t
_ds_create_message_part(void)296 _ds_create_message_part (void)
297 {
298   ds_message_part_t block =
299     (ds_message_part_t) calloc (1, sizeof (struct _ds_message_part));
300 
301   if (!block)
302     goto MEMFAIL;
303 
304   block->headers = nt_create (NT_PTR);
305   if (!block->headers)
306     goto MEMFAIL;
307 
308   block->body = buffer_create (NULL);
309   if (!block->body)
310     goto MEMFAIL;
311 
312   block->encoding   = EN_UNKNOWN;
313   block->media_type = MT_TEXT;
314   block->media_subtype     = MST_PLAIN;
315   block->original_encoding = EN_UNKNOWN;
316   block->content_disposition = PCD_UNKNOWN;
317 
318   /* Not really necessary, but.. */
319 
320   block->boundary = NULL;
321   block->terminating_boundary = NULL;
322   block->original_signed_body = NULL;
323 
324 
325   return block;
326 
327 MEMFAIL:
328   if (block) {
329     buffer_destroy(block->body);
330     nt_destroy(block->headers);
331     free(block);
332   }
333   LOG (LOG_CRIT, ERR_MEM_ALLOC);
334   return NULL;
335 }
336 
337 /*
338  * _ds_create_header_field(const char *heading)
339  *
340  * DESCRIPTION
341  *   create and initialize a new header structure
342  *
343  * INPUT ARGUMENTS
344  *      heading    plain text heading (e.g. "To: Mom")
345  *
346  * RETURN VALUES
347  *   pointer to an allocated header structure (ds_header_t), NULL on failure
348  */
349 
350 ds_header_t
_ds_create_header_field(const char * heading)351 _ds_create_header_field (const char *heading)
352 {
353   char *in = strdup(heading);
354   char *ptr, *m = in, *data;
355   ds_header_t header =
356     (ds_header_t) calloc (1, sizeof (struct _ds_header_field));
357 
358   if (!header || !in)
359     goto MEMFAIL;
360 
361   ptr = strsep (&in, ":");
362   if (ptr) {
363     header->heading = strdup (ptr);
364     if (!header->heading)
365       goto MEMFAIL;
366     else
367     {
368       if (!in)
369       {
370         LOGDEBUG("%s:%u: unexpected data: header string '%s' doesn't "
371                  "contains `:' character", __FILE__, __LINE__, header->heading);
372 
373         /* Use empty string as data as fallback for comtinue processing. */
374 
375         in = "";
376       }
377       else
378       {
379         /* Skip white space */
380         while (*in == 32 || *in == 9)
381           ++in;
382       }
383 
384       data = strdup (in);
385       if (!data)
386         goto MEMFAIL;
387 
388       header->data = data;
389       header->concatenated_data = strdup(data);
390     }
391   }
392 
393   free (m);
394   return header;
395 
396 MEMFAIL:
397   free(header);
398   free(m);
399   LOG (LOG_CRIT, ERR_MEM_ALLOC);
400   return NULL;
401 }
402 
403 /*
404  * _ds_decode_headers (ds_message_part_t block)
405  *
406  * DESCRIPTION
407  *   decodes in-line encoded headers
408  *
409  * RETURN VALUES
410  *   returns 0 on success
411  */
412 
413 int
_ds_decode_headers(ds_message_part_t block)414 _ds_decode_headers (ds_message_part_t block) {
415 #ifdef VERBOSE
416   LOGDEBUG("decoding headers in message block");
417 #endif
418   char *ptr, *dptr, *rest, *enc;
419   ds_header_t header;
420   struct nt_node *node_nt;
421   struct nt_c c_nt;
422   long decoded_len;
423 
424   node_nt = c_nt_first(block->headers, &c_nt);
425   while(node_nt != NULL) {
426     long enc_offset;
427     header = (ds_header_t) node_nt->ptr;
428 
429     for(enc_offset = 0; header->concatenated_data[enc_offset]; enc_offset++)
430     {
431       enc = header->concatenated_data + enc_offset;
432 
433       if (!strncmp(enc, "=?", 2)) {
434         int was_null = 0;
435         char *ptrptr, *decoded = NULL;
436         long offset = (long) enc - (long) header->concatenated_data;
437 
438         if (header->original_data == NULL) {
439           header->original_data = strdup(header->data);
440           was_null = 1;
441         }
442 
443         strtok_r (enc, "?", &ptrptr);
444         strtok_r (NULL, "?", &ptrptr);
445         ptr = strtok_r (NULL, "?", &ptrptr);
446         dptr = strtok_r (NULL, "?", &ptrptr);
447         if (!dptr) {
448           if (was_null && header->original_data != NULL)
449             free(header->original_data);
450           if (was_null)
451             header->original_data = NULL;
452           continue;
453         }
454 
455         rest = dptr + strlen (dptr);
456         if (rest[0]!=0) {
457           rest++;
458           if (rest[0]!=0) rest++;
459         }
460 
461         if (ptr != NULL && (ptr[0] == 'b' || ptr[0] == 'B'))
462           decoded = _ds_decode_base64 (dptr);
463         else if (ptr != NULL && (ptr[0] == 'q' || ptr[0] == 'Q'))
464           decoded = _ds_decode_quoted (dptr);
465 
466         decoded_len = 0;
467 
468         /* Append the rest of the message */
469 
470         if (decoded)
471         {
472           char *new_alloc;
473 
474           decoded_len = strlen(decoded);
475           new_alloc = calloc (1, offset + decoded_len + strlen (rest) + 2);
476           if (new_alloc == NULL) {
477             LOG (LOG_CRIT, ERR_MEM_ALLOC);
478           }
479           else
480           {
481             if (offset)
482               strncpy(new_alloc, header->concatenated_data, offset);
483 
484             strcat(new_alloc, decoded);
485             strcat(new_alloc, rest);
486             free(decoded);
487             decoded = new_alloc;
488           }
489         }
490 
491         if (decoded) {
492           enc_offset += (decoded_len-1);
493           free(header->concatenated_data);
494           header->concatenated_data = decoded;
495         }
496         else if (was_null && header->original_data) {
497           free(header->original_data);
498           header->original_data = NULL;
499         }
500         else if (was_null) {
501           header->original_data = NULL;
502         }
503       }
504     }
505 
506     if (header->original_data != NULL) {
507       free(header->data);
508       header->data = strdup(header->concatenated_data);
509     }
510 
511     node_nt = c_nt_next(block->headers, &c_nt);
512   }
513 
514   return 0;
515 }
516 
517 /*
518  *  _ds_analyze_header (ds_message_part_t block, ds_header_t header,
519  *                      struct nt *boundaries)
520  *
521  * DESCRIPTION
522  *   analyzes the header passed in and performs various operations including:
523  *     - setting media type and subtype
524  *     - setting transfer encoding
525  *     - adding newly discovered boundaries
526  *
527  *   based on the heading specified. essentially all headers should be
528  *   analyzed for future expansion
529  *
530  * INPUT ARGUMENTS
531  *      block		the message block to which the header belongs
532  *      header		the header to analyze
533  *      boundaries	a list of known boundaries found within the block
534  */
535 
536 void
_ds_analyze_header(ds_message_part_t block,ds_header_t header,struct nt * boundaries)537 _ds_analyze_header (
538   ds_message_part_t block,
539   ds_header_t header,
540   struct nt *boundaries)
541 {
542   if (!header || !block || !header->data)
543     return;
544 
545   /* Content-Type header */
546 
547   if (!strcasecmp (header->heading, "Content-Type"))
548   {
549     int len = strlen(header->data);
550     if (!strncasecmp (header->data, "text", 4)) {
551       block->media_type = MT_TEXT;
552       if (len >= 5 && !strncasecmp (header->data + 5, "plain", 5))
553         block->media_subtype = MST_PLAIN;
554       else if (len >= 5 && !strncasecmp (header->data + 5, "html", 4))
555         block->media_subtype = MST_HTML;
556       else
557         block->media_subtype = MST_OTHER;
558     }
559 
560     else if (!strncasecmp (header->data, "application", 11))
561     {
562       block->media_type = MT_APPLICATION;
563       if (len >= 12 && !strncasecmp (header->data + 12, "dspam-signature", 15))
564         block->media_subtype = MST_DSPAM_SIGNATURE;
565       else
566         block->media_subtype = MST_OTHER;
567     }
568 
569     else if (!strncasecmp (header->data, "message", 7))
570     {
571       block->media_type = MT_MESSAGE;
572       if (len >= 8 && !strncasecmp (header->data + 8, "rfc822", 6))
573         block->media_subtype = MST_RFC822;
574       else if (len >= 8 && !strncasecmp (header->data + 8, "inoculation", 11))
575         block->media_subtype = MST_INOCULATION;
576       else
577         block->media_subtype = MST_OTHER;
578     }
579 
580     else if (!strncasecmp (header->data, "multipart", 9))
581     {
582       char boundary[128];
583 
584       block->media_type = MT_MULTIPART;
585       if (len >= 10 && !strncasecmp (header->data + 10, "mixed", 5))
586         block->media_subtype = MST_MIXED;
587       else if (len >= 10 && !strncasecmp (header->data + 10, "alternative", 11))
588         block->media_subtype = MST_ALTERNATIVE;
589       else if (len >= 10 && !strncasecmp (header->data + 10, "signed", 6))
590         block->media_subtype = MST_SIGNED;
591       else if (len >= 10 && !strncasecmp (header->data + 10, "encrypted", 9))
592         block->media_subtype = MST_ENCRYPTED;
593       else
594         block->media_subtype = MST_OTHER;
595 
596       if (!_ds_extract_boundary(boundary, sizeof(boundary), header->data)) {
597         if (!_ds_match_boundary (boundaries, boundary)) {
598           _ds_push_boundary (boundaries, boundary);
599           free(block->boundary);
600           block->boundary = strdup (boundary);
601         }
602       } else {
603         _ds_push_boundary (boundaries, "");
604       }
605     }
606     else {
607       block->media_type = MT_OTHER;
608       block->media_subtype = MST_OTHER;
609     }
610 
611   }
612 
613   /* Content-Transfer-Encoding */
614 
615   else if (!strcasecmp (header->heading, "Content-Transfer-Encoding"))
616   {
617     if (!strncasecmp (header->data, "7bit", 4))
618       block->encoding = EN_7BIT;
619     else if (!strncasecmp (header->data, "8bit", 4))
620       block->encoding = EN_8BIT;
621     else if (!strncasecmp (header->data, "quoted-printable", 16))
622       block->encoding = EN_QUOTED_PRINTABLE;
623     else if (!strncasecmp (header->data, "base64", 6))
624       block->encoding = EN_BASE64;
625     else if (!strncasecmp (header->data, "binary", 6))
626       block->encoding = EN_BINARY;
627     else
628       block->encoding = EN_OTHER;
629   }
630 
631   if (!strcasecmp (header->heading, "Content-Disposition"))
632   {
633     if (!strncasecmp (header->data, "inline", 6))
634       block->content_disposition = PCD_INLINE;
635     else if (!strncasecmp (header->data, "attachment", 10))
636       block->content_disposition = PCD_ATTACHMENT;
637     else
638       block->content_disposition = PCD_OTHER;
639   }
640 
641   return;
642 }
643 
644 /*
645  * _ds_destroy_message (ds_message_t message)
646  *
647  * DESCRIPTION
648  *   destroys a message structure (ds_message_t)
649  *
650  * INPUT ARGUMENTS
651  *      message    the message structure to be destroyed
652  */
653 
654 void
_ds_destroy_message(ds_message_t message)655 _ds_destroy_message (ds_message_t message)
656 {
657   struct nt_node *node_nt;
658   struct nt_c c;
659 
660   if (message == NULL)
661     return;
662 
663   if (message->components) {
664     node_nt = c_nt_first (message->components, &c);
665     while (node_nt != NULL)
666     {
667       ds_message_part_t block = (ds_message_part_t) node_nt->ptr;
668       _ds_destroy_block(block);
669       node_nt = c_nt_next (message->components, &c);
670     }
671     nt_destroy (message->components);
672   }
673   free (message);
674   return;
675 }
676 
677 /*
678  * _ds_destroy_headers (ds_message_part_t block)
679  *
680  * DESCRIPTION
681  *   destroys a message block's header pairs
682  *   does not free the structures themselves; these are freed at nt_destroy
683  *
684  * INPUT ARGUMENTS
685  *      block    the message block containing the headers to destsroy
686  */
687 
688 void
_ds_destroy_headers(ds_message_part_t block)689 _ds_destroy_headers (ds_message_part_t block)
690 {
691   struct nt_node *node_nt;
692   struct nt_c c;
693 
694   if (!block || !block->headers)
695     return;
696 
697   node_nt = c_nt_first (block->headers, &c);
698   while (node_nt != NULL)
699   {
700     ds_header_t field = (ds_header_t) node_nt->ptr;
701 
702     if (field)
703     {
704       free (field->original_data);
705       free (field->heading);
706       free (field->concatenated_data);
707       free (field->data);
708     }
709     node_nt = c_nt_next (block->headers, &c);
710   }
711 
712   return;
713 }
714 
715 /*
716  * _ds_destroy_block (ds_message_part_t block)
717  *
718  * DESCRIPTION
719  *   destroys a message block
720  *
721  * INPUT ARGUMENTS
722  *   block   the message block to destroy
723  */
724 
725 void
_ds_destroy_block(ds_message_part_t block)726 _ds_destroy_block (ds_message_part_t block)
727 {
728   if (!block)
729     return;
730 
731   if (block->headers)
732   {
733     _ds_destroy_headers (block);
734     nt_destroy (block->headers);
735   }
736   buffer_destroy (block->body);
737   buffer_destroy (block->original_signed_body);
738   free (block->boundary);
739   free (block->terminating_boundary);
740 //  free (block);
741   return;
742 }
743 
744 /*
745  * _ds_decode_block (ds_message_part_t block)
746  *
747  * DESCRIPTION
748  *   decodes a message block
749  *
750  * INPUT ARGUMENTS
751  *   block   the message block to decode
752  *
753  * RETURN VALUES
754  *   a pointer to the allocated character array containing the decoded message
755  *   NULL on failure
756  */
757 
758 char *
_ds_decode_block(ds_message_part_t block)759 _ds_decode_block (ds_message_part_t block)
760 {
761   if (block->encoding == EN_BASE64)
762     return _ds_decode_base64 (block->body->data);
763   else if (block->encoding == EN_QUOTED_PRINTABLE)
764     return _ds_decode_quoted (block->body->data);
765 
766   LOG (LOG_WARNING, "decoding of block encoding type %d not supported",
767        block->encoding);
768   return NULL;
769 }
770 
771 /*
772  * _ds_decode_{base64,quoted,hex8bit}
773  *
774  * DESCRIPTION
775  *   supporting block decoder functions
776  *   these function call (or perform) specific decoding functions
777  *
778  * INPUT ARGUMENTS
779  *   body	encoded message body
780  *
781  * RETURN VALUES
782  *   a pointer to the allocated character array containing the decoded body
783  */
784 
/* Thin wrapper around base64decode(); a NULL body yields NULL. */

char *
_ds_decode_base64 (const char *body)
{
  return (body != NULL) ? base64decode (body) : NULL;
}
793 
794 char *
_ds_decode_quoted(const char * body)795 _ds_decode_quoted (const char *body)
796 {
797 #ifdef VERBOSE
798   LOGDEBUG("decoding Quoted Printable encoded buffer");
799 #endif
800   if (!body)
801     return NULL;
802 
803   char *n, *out;
804   const char *end, *p;
805 
806   n = out = malloc(strlen(body)+1);
807   end = body + strlen(body);
808 
809   if (out == NULL) {
810     LOG (LOG_CRIT, ERR_MEM_ALLOC);
811     return NULL;
812   }
813 
814   for (p = body; p < end; p++, n++) {
815     if (*p == '=') {
816       if (p[1] == '\r' && p[2] == '\n') {
817         n -= 1;
818         p += 2;
819       } else if (p[1] == '\n') {
820         n -= 1;
821         p += 1;
822       } else if (p[1] && p[2] && isxdigit((unsigned char) p[1]) && isxdigit((unsigned char) p[2])) {
823         *n = ((_ds_hex2dec((unsigned char) p[1])) << 4) | (_ds_hex2dec((unsigned char) p[2]));
824         p += 2;
825       } else
826         *n = *p;
827     } else
828       *n = *p;
829   }
830 
831   *n = '\0';
832   return (char *)out;
833 }
834 
835 char *
_ds_decode_hex8bit(const char * body)836 _ds_decode_hex8bit (const char *body)
837 {
838 #ifdef VERBOSE
839   LOGDEBUG("decoding hexadecimal 8-bit encodings in message block");
840 #endif
841   if (!body)
842     return NULL;
843 
844   char *n, *out;
845   const char *end, *p;
846 
847   n = out = malloc(strlen(body)+1);
848   end = body + strlen(body);
849 
850   if (out == NULL) {
851     LOG (LOG_CRIT, ERR_MEM_ALLOC);
852     return NULL;
853   }
854 
855   for (p = body; p < end; p++, n++) {
856     if (*p == '%')
857       if (p[1] && p[2] && isxdigit((unsigned char) p[1]) && isxdigit((unsigned char) p[2])) {
858         *n = ((_ds_hex2dec((unsigned char) p[1])) << 4) | (_ds_hex2dec((unsigned char) p[2]));
859         p += 2;
860       } else
861         *n = *p;
862     else
863       *n = *p;
864   }
865 
866   *n = '\0';
867   return (char *)out;
868 }
869 
870 /*
871  * _ds_encode_block (ds_message_part_t block, int encoding)
872  *
873  * DESCRIPTION
874  *   encodes a message block using the encoding specified and replaces the
875  *   block's message body with the encoded data
876  *
877  * INPUT ARGUMENTS
878  *      block       the message block to encode
879  *      encoding    encoding to use (EN_)
880  *
881  * RETURN VALUES
882  *    returns 0 on success
883  */
884 
885 int
_ds_encode_block(ds_message_part_t block,int encoding)886 _ds_encode_block (ds_message_part_t block, int encoding)
887 {
888   /* we can't encode a block with the same encoding */
889 
890   if (block->encoding == encoding)
891     return EINVAL;
892 
893   /* we can't encode a block that's already encoded */
894 
895   if (block->encoding == EN_BASE64 || block->encoding == EN_QUOTED_PRINTABLE)
896     return EFAILURE;
897 
898   if (encoding == EN_BASE64) {
899     char *encoded = _ds_encode_base64 (block->body->data);
900     buffer_destroy (block->body);
901     block->body = buffer_create (encoded);
902     free (encoded);
903     block->encoding = EN_BASE64;
904   }
905   else if (encoding == EN_QUOTED_PRINTABLE) {
906 
907     /* TODO */
908 
909     return 0;
910   }
911 
912   LOGDEBUG("unsupported encoding: %d", encoding);
913   return 0;
914 }
915 
/*
 * _ds_encode_base64 (const char *body)
 *
 * DESCRIPTION
 *   supporting block encoder function; hands the body to base64encode()
 *
 * INPUT ARGUMENTS
 *   body        decoded message body
 *
 * RETURN VALUES
 *   a pointer to the allocated character array containing the encoded body,
 *   NULL when body is NULL
 */

char *
_ds_encode_base64 (const char *body)
{
  /* Same NULL contract as _ds_decode_base64() */

  if (body == NULL)
    return NULL;

  return base64encode (body);
}
935 
936 /*
937  * _ds_assemble_message (ds_message_t message)
938  *
939  * DESCRIPTION
940  *   assembles a message structure into a flat text message
941  *
942  * INPUT ARGUMENTS
943  *      message    the message structure (ds_message_t) to assemble
944  *
945  * RETURN VALUES
946  *   a pointer to the allocated character array containing the text message
947  */
948 
949 char *
_ds_assemble_message(ds_message_t message,const char * newline)950 _ds_assemble_message (ds_message_t message, const char *newline)
951 {
952   buffer *out = buffer_create (NULL);
953   struct nt_node *node_nt, *node_header;
954   struct nt_c c_nt, c_nt2;
955   char *heading;
956   char *copyback;
957 #ifdef VERBOSE
958   int i = 0;
959 #endif
960 
961   if (!out) {
962     LOG (LOG_CRIT, ERR_MEM_ALLOC);
963     return NULL;
964   }
965 
966   node_nt = c_nt_first (message->components, &c_nt);
967   while (node_nt != NULL && node_nt->ptr != NULL)
968   {
969     ds_message_part_t block =
970       (ds_message_part_t) node_nt->ptr;
971 #ifdef VERBOSE
972     LOGDEBUG ("assembling component %d", i);
973 #endif
974 
975     /* Assemble headers */
976 
977     if (block->headers != NULL && block->headers->items > 0)
978     {
979       node_header = c_nt_first (block->headers, &c_nt2);
980       while (node_header != NULL)
981       {
982         char *data;
983         ds_header_t current_header =
984           (ds_header_t) node_header->ptr;
985 
986         data = (current_header->original_data == NULL) ? current_header->data :
987                current_header->original_data;
988 
989         heading = malloc(
990             ((current_header->heading) ? strlen(current_header->heading) : 0)
991           + ((data) ? strlen(data) : 0)
992           + 3 + strlen(newline));
993 
994         if (current_header->heading != NULL &&
995             (!strncmp (current_header->heading, "From ", 5) ||
996              !strncmp (current_header->heading, "--", 2)))
997           sprintf (heading, "%s:%s%s",
998             (current_header->heading) ? current_header->heading : "",
999             (data) ? data : "", newline);
1000         else
1001           sprintf (heading, "%s: %s%s",
1002             (current_header->heading) ? current_header->heading : "",
1003             (data) ? data : "", newline);
1004 
1005         buffer_cat (out, heading);
1006         free(heading);
1007         node_header = c_nt_next (block->headers, &c_nt2);
1008       }
1009     }
1010 
1011     buffer_cat (out, newline);
1012 
1013     /* Assemble bodies */
1014 
1015     if (block->original_signed_body != NULL && message->protect)
1016       buffer_cat (out, block->original_signed_body->data);
1017     else
1018       buffer_cat (out, block->body->data);
1019 
1020     if (block->terminating_boundary != NULL)
1021     {
1022       buffer_cat (out, "--");
1023       buffer_cat (out, block->terminating_boundary);
1024     }
1025 
1026     node_nt = c_nt_next (message->components, &c_nt);
1027 #ifdef VERBOSE
1028     i++;
1029 #endif
1030 
1031     if (node_nt != NULL && node_nt->ptr != NULL)
1032       buffer_cat (out, newline);
1033   }
1034 
1035   copyback = out->data;
1036   out->data = NULL;
1037   buffer_destroy (out);
1038   return copyback;
1039 }
1040 
1041 /*
1042  * _ds_{push,pop,match,extract}_boundary
1043  *
1044  * DESCRIPTION
1045  *   these functions maintain and service a boundary "stack" on the message
1046  */
1047 
1048 int
_ds_push_boundary(struct nt * stack,const char * boundary)1049 _ds_push_boundary (struct nt *stack, const char *boundary)
1050 {
1051   char *y;
1052 
1053   if (boundary == NULL || boundary[0] == 0)
1054     return EINVAL;
1055 
1056   y = malloc (strlen (boundary) + 3);
1057   if (y == NULL)
1058     return EUNKNOWN;
1059 
1060   sprintf (y, "--%s", boundary);
1061   nt_add (stack, (char *) y);
1062   free(y);
1063 
1064   return 0;
1065 }
1066 
1067 char *
_ds_pop_boundary(struct nt * stack)1068 _ds_pop_boundary (struct nt *stack)
1069 {
1070   struct nt_node *node, *last_node = NULL, *parent_node = NULL;
1071   struct nt_c c;
1072   char *boundary = NULL;
1073 
1074   node = c_nt_first (stack, &c);
1075   while (node != NULL)
1076   {
1077     parent_node = last_node;
1078     last_node = node;
1079     node = c_nt_next (stack, &c);
1080   }
1081   if (parent_node != NULL)
1082     parent_node->next = NULL;
1083   else
1084     stack->first = NULL;
1085 
1086   if (last_node == NULL)
1087     return NULL;
1088 
1089   boundary = strdup (last_node->ptr);
1090 
1091   free (last_node->ptr);
1092   free (last_node);
1093 
1094   return boundary;
1095 }
1096 
1097 int
_ds_match_boundary(struct nt * stack,const char * buff)1098 _ds_match_boundary (struct nt *stack, const char *buff)
1099 {
1100   struct nt_node *node;
1101   struct nt_c c;
1102 
1103   node = c_nt_first (stack, &c);
1104   while (node != NULL)
1105   {
1106     if (!strncmp (buff, node->ptr, strlen (node->ptr)))
1107     {
1108       return 1;
1109     }
1110     node = c_nt_next (stack, &c);
1111   }
1112   return 0;
1113 }
1114 
1115 int
_ds_extract_boundary(char * buf,size_t size,char * mem)1116 _ds_extract_boundary (char *buf, size_t size, char *mem)
1117 {
1118   char *data, *ptr, *ptrptr;
1119 
1120   if (mem == NULL)
1121     return EINVAL;
1122 
1123   data = strdup(mem);
1124   if (data == NULL) {
1125     LOG(LOG_CRIT, ERR_MEM_ALLOC);
1126     return EUNKNOWN;
1127   }
1128 
1129   for(ptr=data;ptr<(data+strlen(data));ptr++) {
1130     if (!strncasecmp(ptr, "boundary", 8)) {
1131       ptr = strchr(ptr, '=');
1132       if (ptr == NULL) {
1133         free(data);
1134         return EFAILURE;
1135       }
1136       ptr++;
1137       while(isspace((int) ptr[0]))
1138         ptr++;
1139       if (ptr[0] == '"')
1140         ptr++;
1141       strtok_r(ptr, " \";\n\t", &ptrptr);
1142       strlcpy(buf, ptr, size);
1143       free(data);
1144       return 0;
1145     }
1146   }
1147 
1148   free(data);
1149   return EFAILURE;
1150 }
1151 
1152 /*
1153  * _ds_find_header (ds_message_t message, consr char *heading) {
1154  *
1155  * DESCRIPTION
1156  *   finds a header and returns its value
1157  *
1158  * INPUT ARGUMENTS
1159  *   message     the message structure to search
1160  *   heading	the heading to search for
1161  *   flags	optional search flags
1162  *
1163  * RETURN VALUES
1164  *   a pointer to the header structure's value
1165  *
1166  */
1167 
1168 char *
_ds_find_header(ds_message_t message,const char * heading)1169 _ds_find_header (ds_message_t message, const char *heading) {
1170   ds_message_part_t block;
1171   ds_header_t head;
1172   struct nt_node *node_nt;
1173 
1174   if (message->components->first) {
1175     if ((block = message->components->first->ptr)==NULL)
1176       return NULL;
1177     if (block->headers == NULL)
1178       return NULL;
1179   } else {
1180     return NULL;
1181   }
1182 
1183   node_nt = block->headers->first;
1184   while(node_nt != NULL) {
1185     head = (ds_header_t) node_nt->ptr;
1186     if (head && !strcasecmp(head->heading, heading)) {
1187       return head->data;
1188     }
1189     node_nt = node_nt->next;
1190   }
1191 
1192   return NULL;
1193 }
1194 
/*
 * _ds_hex2dec (unsigned char hex)
 *
 * DESCRIPTION
 *   converts a single hexadecimal digit character to its numeric value
 *
 * INPUT ARGUMENTS
 *   hex        character to convert ('0'-'9', 'a'-'f' or 'A'-'F')
 *
 * RETURN VALUES
 *   0..15 for a valid hexadecimal digit, -1 for any other character
 */

int _ds_hex2dec(unsigned char hex) {
  /* the letter ranges rely on 'a'..'f' and 'A'..'F' being contiguous,
   * which holds for ASCII */
  if (hex >= '0' && hex <= '9')
    return hex - '0';

  if (hex >= 'a' && hex <= 'f')
    return 10 + (hex - 'a');

  if (hex >= 'A' && hex <= 'F')
    return 10 + (hex - 'A');

  return -1;
}
1216 
1217 /*
1218  * _ds_strip_html(const char *html)
1219  *
1220  * DESCRIPTION
1221  *    strip html tags from the supplied message
1222  *
1223  * INPUT ARGUMENTS
1224  *     html encoded message body
1225  *
1226  * RETURN VALUES
1227  *   a pointer to the allocated character array containing the
1228  *   stripped message
1229  *
1230  */
1231 
1232 char *
_ds_strip_html(const char * html)1233 _ds_strip_html (const char *html)
1234 {
1235 #ifdef VERBOSE
1236   LOGDEBUG("stripping HTML tags from message block");
1237 #endif
1238   size_t j = 0, k = 0, i = 0;
1239   int visible = 1;
1240   int closing_td_tag = 0;
1241   char *html2;
1242   const char *cdata_close_tag = NULL;
1243 
1244   if(!html)
1245     return NULL;
1246 
1247   static struct {
1248     unsigned int id;
1249     char *entity;
1250   }
1251   charset[] = {
1252     {   32, "&nbsp;"    }, {  34, "&quot;"    }, {  34, "&quot;"    }, {  38, "&amp;"     },
1253     {   38, "&amp;"     }, {  39, "&apos;"    }, {  60, "&lt;"      }, {  60, "&lt;"      },
1254     {   62, "&gt;"      }, {  62, "&gt;"      }, { 160, "&nbsp;"    }, { 161, "&iexcl;"   },
1255     {  162, "&cent;"    }, { 163, "&pound;"   }, { 164, "&curren;"  }, { 165, "&yen;"     },
1256     {  166, "&brvbar;"  }, { 167, "&sect;"    }, { 168, "&uml;"     }, { 169, "&copy;"    },
1257     {  170, "&ordf;"    }, { 171, "&laquo;"   }, { 172, "&not;"     }, { 173, "&shy;"     },
1258     {  174, "&reg;"     }, { 175, "&macr;"    }, { 176, "&deg;"     }, { 177, "&plusmn;"  },
1259     {  178, "&sup2;"    }, { 179, "&sup3;"    }, { 180, "&acute;"   }, { 181, "&micro;"   },
1260     {  182, "&para;"    }, { 183, "&middot;"  }, { 184, "&cedil;"   }, { 185, "&sup1;"    },
1261     {  186, "&ordm;"    }, { 187, "&raquo;"   }, { 188, "&frac14;"  }, { 189, "&frac12;"  },
1262     {  190, "&frac34;"  }, { 191, "&iquest;"  }, { 192, "&Agrave;"  }, { 193, "&Aacute;"  },
1263     {  194, "&Acirc;"   }, { 195, "&Atilde;"  }, { 196, "&Auml;"    }, { 197, "&Aring;"   },
1264     {  198, "&AElig;"   }, { 199, "&Ccedil;"  }, { 200, "&Egrave;"  }, { 201, "&Eacute;"  },
1265     {  202, "&Ecirc;"   }, { 203, "&Euml;"    }, { 204, "&Igrave;"  }, { 205, "&Iacute;"  },
1266     {  206, "&Icirc;"   }, { 207, "&Iuml;"    }, { 208, "&ETH;"     }, { 209, "&Ntilde;"  },
1267     {  210, "&Ograve;"  }, { 211, "&Oacute;"  }, { 212, "&Ocirc;"   }, { 213, "&Otilde;"  },
1268     {  214, "&Ouml;"    }, { 215, "&times;"   }, { 216, "&Oslash;"  }, { 217, "&Ugrave;"  },
1269     {  218, "&Uacute;"  }, { 219, "&Ucirc;"   }, { 220, "&Uuml;"    }, { 221, "&Yacute;"  },
1270     {  222, "&THORN;"   }, { 223, "&szlig;"   }, { 224, "&agrave;"  }, { 225, "&aacute;"  },
1271     {  226, "&acirc;"   }, { 227, "&atilde;"  }, { 228, "&auml;"    }, { 229, "&aring;"   },
1272     {  230, "&aelig;"   }, { 231, "&ccedil;"  }, { 232, "&egrave;"  }, { 233, "&eacute;"  },
1273     {  234, "&ecirc;"   }, { 235, "&euml;"    }, { 236, "&igrave;"  }, { 237, "&iacute;"  },
1274     {  238, "&icirc;"   }, { 239, "&iuml;"    }, { 240, "&eth;"     }, { 241, "&ntilde;"  },
1275     {  242, "&ograve;"  }, { 243, "&oacute;"  }, { 244, "&ocirc;"   }, { 245, "&otilde;"  },
1276     {  246, "&ouml;"    }, { 247, "&divide;"  }, { 248, "&oslash;"  }, { 249, "&ugrave;"  },
1277     {  250, "&uacute;"  }, { 251, "&ucirc;"   }, { 252, "&uuml;"    }, { 253, "&yacute;"  },
1278     {  254, "&thorn;"   }, { 255, "&yuml;"    }, { 338, "&OElig;"   }, { 339, "&oelig;"   },
1279     {  352, "&Scaron;"  }, { 353, "&scaron;"  }, { 376, "&Yuml;"    }, { 402, "&fnof;"    },
1280     {  710, "&circ;"    }, { 732, "&tilde;"   }, { 913, "&Alpha;"   }, { 914, "&Beta;"    },
1281     {  915, "&Gamma;"   }, { 916, "&Delta;"   }, { 917, "&Epsilon;" }, { 918, "&Zeta;"    },
1282     {  919, "&Eta;"     }, { 920, "&Theta;"   }, { 921, "&Iota;"    }, { 922, "&Kappa;"   },
1283     {  923, "&Lambda;"  }, { 924, "&Mu;"      }, { 925, "&Nu;"      }, { 926, "&Xi;"      },
1284     {  927, "&Omicron;" }, { 928, "&Pi;"      }, { 929, "&Rho;"     }, { 931, "&Sigma;"   },
1285     {  932, "&Tau;"     }, { 933, "&Upsilon;" }, { 934, "&Phi;"     }, { 935, "&Chi;"     },
1286     {  936, "&Psi;"     }, { 937, "&Omega;"   }, { 945, "&alpha;"   }, { 946, "&beta;"    },
1287     {  947, "&gamma;"   }, { 948, "&delta;"   }, { 949, "&epsilon;" }, { 950, "&zeta;"    },
1288     {  951, "&eta;"     }, { 952, "&theta;"   }, { 953, "&iota;"    }, { 954, "&kappa;"   },
1289     {  955, "&lambda;"  }, { 956, "&mu;"      }, { 957, "&nu;"      }, { 958, "&xi;"      },
1290     {  959, "&omicron;" }, { 960, "&pi;"      }, { 961, "&rho;"     }, { 962, "&sigmaf;"  },
1291     {  963, "&sigma;"   }, { 964, "&tau;"     }, { 965, "&upsilon;" }, { 966, "&phi;"     },
1292     {  967, "&chi;"     }, { 968, "&psi;"     }, { 969, "&omega;"   }, { 977, "&thetasym" },
1293     {  978, "&upsih;"   }, { 982, "&piv;"     }, {8194, "&ensp;"    }, {8195, "&emsp;"    },
1294     { 8201, "&thinsp;"  }, {8204, "&zwnj;"    }, {8205, "&zwj;"     }, {8206, "&lrm;"     },
1295     { 8207, "&rlm;"     }, {8211, "&ndash;"   }, {8212, "&mdash;"   }, {8216, "&lsquo;"   },
1296     { 8217, "&rsquo;"   }, {8218, "&sbquo;"   }, {8220, "&ldquo;"   }, {8221, "&rdquo;"   },
1297     { 8222, "&bdquo;"   }, {8224, "&dagger;"  }, {8225, "&Dagger;"  }, {8226, "&bull;"    },
1298     { 8230, "&hellip;"  }, {8240, "&permil;"  }, {8242, "&prime;"   }, {8243, "&Prime;"   },
1299     { 8249, "&lsaquo;"  }, {8250, "&rsaquo;"  }, {8254, "&oline;"   }, {8260, "&frasl;"   },
1300     { 8364, "&euro;"    }, {8465, "&image;"   }, {8472, "&weierp;"  }, {8476, "&real;"    },
1301     { 8482, "&trade;"   }, {8501, "&alefsym;" }, {8592, "&larr;"    }, {8593, "&uarr;"    },
1302     { 8594, "&rarr;"    }, {8595, "&darr;"    }, {8596, "&harr;"    }, {8629, "&crarr;"   },
1303     { 8656, "&lArr;"    }, {8657, "&uArr;"    }, {8658, "&rArr;"    }, {8659, "&dArr;"    },
1304     { 8660, "&hArr;"    }, {8704, "&forall;"  }, {8706, "&part;"    }, {8707, "&exist;"   },
1305     { 8709, "&empty;"   }, {8711, "&nabla;"   }, {8712, "&isin;"    }, {8713, "&notin;"   },
1306     { 8715, "&ni;"      }, {8719, "&prod;"    }, {8721, "&sum;"     }, {8722, "&minus;"   },
1307     { 8727, "&lowast;"  }, {8730, "&radic;"   }, {8733, "&prop;"    }, {8734, "&infin;"   },
1308     { 8736, "&ang;"     }, {8743, "&and;"     }, {8744, "&or;"      }, {8745, "&cap;"     },
1309     { 8746, "&cup;"     }, {8747, "&int;"     }, {8756, "&there4;"  }, {8764, "&sim;"     },
1310     { 8773, "&cong;"    }, {8776, "&asymp;"   }, {8800, "&ne;"      }, {8801, "&equiv;"   },
1311     { 8804, "&le;"      }, {8805, "&ge;"      }, {8834, "&sub;"     }, {8835, "&sup;"     },
1312     { 8836, "&nsub;"    }, {8838, "&sube;"    }, {8839, "&supe;"    }, {8853, "&oplus;"   },
1313     { 8855, "&otimes;"  }, {8869, "&perp;"    }, {8901, "&sdot;"    }, {8968, "&lceil;"   },
1314     { 8969, "&rceil;"   }, {8970, "&lfloor;"  }, {8971, "&rfloor;"  }, {9001, "&lang;"    },
1315     { 9002, "&rang;"    }, {9674, "&loz;"     }, {9824, "&spades;"  }, {9827, "&clubs;"   },
1316     { 9829, "&hearts;"  }, {9830, "&diams;"   }
1317   };
1318   int num_chars = sizeof(charset) / sizeof(charset[0]);
1319 
1320   static struct {
1321     char *open_tag;
1322     char *uri_tag;
1323   }
1324   uritag[] = {
1325     {          "<a", "href"       }, {        "<img", "src"        }, {      "<input", "src"        },
1326     {     "<iframe", "src"        }, {      "<frame", "src"        }, {     "<script", "src"        },
1327     {       "<form", "action"     }, {      "<embed", "src"        }, {       "<area", "href"       },
1328     {       "<base", "href"       }, {       "<link", "href"       }, {     "<source", "src"        },
1329     {       "<body", "background" }, { "<blockquote", "cite"       }, {          "<q", "cite"       },
1330     {        "<ins", "cite"       }, {        "<del", "cite"       }
1331   };
1332   int num_uri = sizeof(uritag) / sizeof(uritag[0]);
1333 
1334   size_t len = strlen(html);
1335   html2 = malloc(len+1);
1336 
1337   if (html2 == NULL) {
1338     LOG (LOG_CRIT, ERR_MEM_ALLOC);
1339     return NULL;
1340   }
1341 
1342   for (i = 0; i < len; i++) {
1343     if (html[i] == '<') {
1344       if (cdata_close_tag) {
1345         if (strncasecmp(html + i, cdata_close_tag, strlen(cdata_close_tag)) == 0) {
1346           i += strlen(cdata_close_tag) - 1;
1347           cdata_close_tag = NULL;
1348         }
1349         continue;
1350       } else if (strncasecmp(html + i, "</td>", 5) == 0) {
1351         i += 4;
1352         closing_td_tag = 1;
1353         continue;
1354       } else if (strncasecmp(html + i, "<td", 3) == 0 && closing_td_tag) {
1355         if (j > 0 && !isspace(html2[j-1])) {
1356           html2[j++]=' ';
1357         }
1358         visible = 0;
1359       } else {
1360         closing_td_tag = 0;
1361         visible = 1;
1362       }
1363       k = i + 1;
1364 
1365       if ((k < len) && (!( (html[k] >= 65 && html[k] <= 90) ||
1366                            (html[k] >= 97 && html[k] <= 122) ||
1367                            (html[k] == 47) ||
1368                            (html[k] == 33) ))) {
1369         /* Not a HTML tag. HTML tags start with a letter, forwardslash or exclamation mark */
1370         visible = 1;
1371         html2[j++]=html[i];
1372         i = k;
1373         const char *w = &(html[k]);
1374         while (j < len && (size_t)(w - html) < len && *w != '<') {
1375           html2[j++]=*w;
1376           w++;
1377           i++;
1378         }
1379         continue;
1380       } else if (html[k]) {
1381         /* find the end of the tag */
1382         while (k < len && html[k] != '<' && html[k] != '>') {k++;}
1383 
1384         /* if we've got a tag with a uri, save the address to print later. */
1385         char *url_tag = " ";
1386         int tag_offset = 0, x = 0, y = 0;
1387         for (y = 0; y < num_uri; y++) {
1388           x = strlen(uritag[y].open_tag);
1389           if (strncasecmp(html+i,uritag[y].open_tag,x)==0 && (i+x < len && isspace(html[i+x]))) {
1390             url_tag = uritag[y].uri_tag;
1391             tag_offset = i + x + 1;
1392             break;
1393           }
1394         }
1395         /* tag with uri found */
1396         if (tag_offset > 0) {
1397           size_t url_start;         /* start of url tag inclusive [ */
1398           size_t url_tag_len = strlen(url_tag);
1399           char delim = ' ';
1400           /* find start of uri */
1401           for (url_start = tag_offset; url_start <= k; url_start++) {
1402             if (strncasecmp(html + url_start, url_tag, url_tag_len) == 0) {
1403               url_start += url_tag_len;
1404               while (html[url_start] && isspace(html[url_start])) {url_start++;}   /* remove spaces before = */
1405               if (html[url_start] == '=') {
1406                 url_start++;
1407                 while (html[url_start] && isspace(html[url_start])) {url_start++;} /* remove spaces after = */
1408                 if (html[url_start] == '"') {
1409                   delim = '"';
1410                   url_start++;
1411                 } else if (html[url_start] == '\'') {
1412                   delim = '\'';
1413                   url_start++;
1414                 } else {
1415                   delim = '>';
1416                 }
1417                 break;
1418               } else {
1419                 /* Start of uri tag found but no '=' after the tag.
1420                  * Skip the whole tag.
1421                  */
1422                 break;
1423               }
1424             } else if ((url_start - tag_offset) >= 50) {
1425               /* The length of the html tag is over 50 characters long without
1426                * finding the start of the url/uri. Skip the whole tag.
1427                */
1428               break;
1429             }
1430           }
1431           /* find end of uri */
1432           if (delim != ' ') {
1433             if (url_start < len &&
1434                 (strncasecmp(html + url_start, "http:", 5) == 0 ||
1435                  strncasecmp(html + url_start, "https:", 6) == 0 ||
1436                  strncasecmp(html + url_start, "ftp:", 4) == 0)) {
1437               html2[j++]=' ';
1438               const char *w = &(html[url_start]);
1439               /* html2 is a buffer of len + 1, where the +1 is for NULL
1440                * termination. This means we only want to loop to len
1441                * since we will replace html2[j] right after the loop.
1442                */
1443               while (j < len && (size_t)(w - html) < len && *w != delim) {
1444                 html2[j++]=*w;
1445                 w++;
1446               }
1447               html2[j++]=' ';
1448             }
1449           }
1450         } else if (strncasecmp(html + i, "<p>", 3) == 0
1451                 || strncasecmp(html + i, "<p ", 3) == 0
1452                 || strncasecmp(html + i, "<p\t", 3) == 0
1453                 || strncasecmp(html + i, "<tr", 3) == 0
1454                 || strncasecmp(html + i, "<option", 7) == 0
1455                 || strncasecmp(html + i, "<br", 3) == 0
1456                 || strncasecmp(html + i, "<li", 3) == 0
1457                 || strncasecmp(html + i, "<div", 4) == 0
1458                 || strncasecmp(html + i, "</select>", 9) == 0
1459                 || strncasecmp(html + i, "</table>", 8) == 0) {
1460           if (j > 0 && html2[j-1] != '\n' && html2[j-1] != '\r') {
1461             html2[j++] = '\n';
1462           }
1463         } else if (strncasecmp(html + i, "<applet", 7) == 0) {
1464           cdata_close_tag = "</applet>";
1465         } else if (strncasecmp(html + i, "<embed", 6) == 0) {
1466           cdata_close_tag = "</embed>";
1467         } else if (strncasecmp(html + i, "<frameset", 9) == 0) {
1468           cdata_close_tag = "</frameset>";
1469         } else if (strncasecmp(html + i, "<frame", 6) == 0) {
1470           cdata_close_tag = "</frame>";
1471         } else if (strncasecmp(html + i, "<iframe", 7) == 0) {
1472           cdata_close_tag = "</iframe>";
1473         } else if (strncasecmp(html + i, "<noembed", 8) == 0) {
1474           cdata_close_tag = "</noembed>";
1475         } else if (strncasecmp(html + i, "<noscript", 9) == 0) {
1476           cdata_close_tag = "</noscript>";
1477         } else if (strncasecmp(html + i, "<object", 7) == 0) {
1478           cdata_close_tag = "</object>";
1479         } else if (strncasecmp(html + i, "<script", 7) == 0) {
1480           cdata_close_tag = "</script>";
1481         } else if (strncasecmp(html + i, "<style", 6) == 0) {
1482           cdata_close_tag = "</style>";
1483         }
1484         i = (html[k] == '<' || html[k] == '\0')? k - 1: k;
1485         continue;
1486       }
1487     } else if (cdata_close_tag) {
1488       continue;
1489     } else if (!isspace(html[i])) {
1490       visible = 1;
1491     }
1492 
1493     if (strncmp(html+i,"&#",2)==0) {
1494       int x = 0;
1495       const char *w = &(html[i+2]);
1496       while (*w == '0') {i++;w++;}
1497       char n[5];
1498       if (html[i+4] && html[i+4] == ';'
1499           && isdigit(html[i+2])
1500           && isdigit(html[i+3])) {
1501         n[0] = html[i+2];
1502         n[1] = html[i+3];
1503         n[2] = 0;
1504         x = atoi(n);
1505         if (x <= 255 && x >= 32)
1506           html2[j++] = x;
1507         i += 4;
1508       } else if (html[i+6]
1509                   && html[i+6] == ';'
1510                   && isdigit(html[i+2])
1511                   && isdigit(html[i+3])
1512                   && isdigit(html[i+4])
1513                   && isdigit(html[i+5])) {
1514         n[0] = html[i+2];
1515         n[1] = html[i+3];
1516         n[2] = html[i+4];
1517         n[3] = html[i+5];
1518         n[4] = 0;
1519         x = atoi(n);
1520         if (x <= 255 && x >= 32)
1521           html2[j++] = x;
1522         i += 6;
1523       } else {
1524         const char *w = &(html[i]);
1525         while (*w != ';' && *w != ' ' && *w != '\t' && *w != '\0') {i++;w++;}
1526       }
1527       visible = 0;
1528       continue;
1529     } else if (html[i] == '&') {
1530       int x = 0, y = 0;
1531       for (y = 0; y < num_chars; y++) {
1532         x = strlen(charset[y].entity);
1533         if (strncasecmp(html+i,charset[y].entity,x)==0) {
1534           if (charset[y].id <= 255)
1535             html2[j++] = charset[y].id;
1536           i += x-1;
1537           visible = 0;
1538           continue;
1539         }
1540       }
1541     }
1542 
1543     if (j < len && visible)
1544       html2[j++] = html[i];
1545 
1546     if (j >= len)
1547       i = j = len;
1548   }
1549 
1550   html2[j] = '\0';
1551   return (char *)html2;
1552 }
1553