1 /* ---------------------------------------------------------------------- 2 MIME Mangler - single pass reduction of MIME to plain text 3 4 Laurence Lundblade <lgl@qualcomm.com> 5 6 Copyright 1997,1998,1999 QUALCOMM Incorporated 7 8 File: striphtml.c 9 Version: 0.3.0, May 1999 10 Last Edited: 11 12 ---- */ 13 14 /* Original header; before LGL hacking */ 15 /* 16 || File: striphtml.c - HTML-to-text converter. 17 || Author: Brian Kelley, Qualcomm Inc. 18 || Notice: Copyright 1997 Qualcomm, Inc. 19 || Date: 14-Sep-97 20 || Hacked by: Laurence Lundblade 21 || 22 || Handles: 23 || - SGML tags and comments, including quoted ">" in attribute values 24 || - SGML numeric references (eg: &#xx;) 25 || - HTML entities (eg: &) 26 || - stripping redundant spaces 27 || - Paragraph breaks caused by HTML open and close elements 28 || - HTML preformatted blocks (PRE, LISTING, XMP, PLAINTEXT) 29 || - Special processing escapes: XMP and PLAINTEXT 30 || - stripping contents of certain HTML elements (COMMENT, SCRIPT, 31 || SELECT, TITLE). 32 || - LI items marked with a bullet character (not ISO8859, but 33 || compatible with Windows, Mac, Pilot ... I hope) 34 || 35 || This does not_ try to: 36 || - Wrap text to a given width 37 || - Mimic HTML indentation using spaces 38 */ 39 40 #ifndef _STRIPHTMLINCLUDED 41 #define _STRIPHTMLINCLUDED 42 #include "config.h" 43 #include <mime.h> 44 #include "charmangle.h" 45 typedef struct Html Html; 46 47 #define URL_BUF_SIZE 1024 48 49 50 /* 51 || This callback is used by the Html object to output processed text. 52 */ 53 54 /* typedef void (*OutputFn)(void *pv, const char *buf, int len); */ 55 56 57 /* 58 || For static and automatic allocation of Html objects, the structure 59 || definition is included in this header file. If using dynamic 60 || allocation for Html objects, Html_New() and Html_Delete() can be used 61 || instead of Html_Ctor() and Html_Dtor(), and the structure definition 62 || can be moved out of the header file into striphtml.c. That way, 63 || client code will have absolutely no dependence on Html's internal 64 || structure linked into it. 65 */ 66 67 extern void Html_Ctor __PROTO((Html *me, OutputFn pfOut, void *pvOut)); 68 extern void Html_Dtor __PROTO(()); 69 70 /* 71 || This is the function you use to feed data to the Html object. 72 */ 73 extern void Html_Write __PROTO((Html *me, const char *szText, int cbText)); 74 75 76 77 #define DATAMAX 12 /* maximum number of characters to hold element name */ 78 79 #define ATTRMAX 20 /* Maximum character from a parameter to save */ 80 81 82 struct Html { 83 OutputFn *pfOut; 84 void * pvOut; 85 86 int nState; 87 int chQuote; /* terminating quote character when in 88 ssValueQuoted */ 89 int cntDashes; /* used in ssComment state to test for 90 termination of comment */ 91 int ndxData; /* next index into szData */ 92 char szData[DATAMAX]; /* current tag name */ 93 char szXMP[DATAMAX]; /* special non-SGML mode for HTML compatibility */ 94 95 /* Text parsing state */ 96 int nTextState; 97 int nBreaks; 98 int nCol; 99 100 /* Save an attribute */ 101 int nattrName; 102 char attrName[DATAMAX]; 103 int nattrVal; 104 char attrVal[ATTRMAX]; 105 }; 106 107 108 void *textHTMLInit __PROTO((OutputFn oFn, void *oFnState, 109 TextCharSetType partCharSet, 110 TextCharSetType reqCharSet)); 111 OutputFn textHTMLToPlain; 112 113 114 /* ---------------------------------------------------------------------- 115 Conversion of text/plain to HTML 116 ---- */ 117 OutputFn textPlainToHTMLHead; 118 OutputFn textPlainToHTML; 119 OutputFn textPlainToHTMLTail; 120 121 122 /* ---------------------------------------------------------------------- 123 Conversion of text/plain format=flowed to HTML 124 ---- */ 125 struct Flowed2HTML { 126 /* Where to send the output next */ 127 OutputFn *pfOut; 128 void * pvOut; 129 130 /* Are we in a paragraph? */ 131 int m_bInPara; 132 133 /* Count quote depth */ 134 int m_nLastQuoteDepth; 135 int m_nQuoteDepth; 136 137 int m_bLook4Quote; 138 139 /* Are we in a line's content, or at its start? */ 140 enum {AT_START, 141 IN_CONTENT} m_nLineState; 142 143 /* Track runs of spaces (including trailing spaces) */ 144 int m_nSpaceCount; 145 146 /* Treat signature separator line specially */ 147 enum {NO_DASH, 148 FIRST_DASH, 149 SECOND_DASH, 150 DASH_DASH_SPACE} m_nSigSepState; 151 152 #ifdef _TRAP_URLS 153 /* Buffer potential URLs */ 154 char m_cURLbuf [ URL_BUF_SIZE ]; 155 int m_nURLcount; 156 #endif 157 158 }; 159 160 void *textFlowedToHTMLInit __PROTO((OutputFn oFn, void *oFnState)); 161 OutputFn textFlowedToHTMLHead; 162 void ffToHTML_lineStart __PROTO((struct Flowed2HTML *me)); 163 OutputFn textFlowedToHTML; 164 OutputFn textFlowedToHTMLTail; 165 166 167 #endif /*_ MIMEMANGLEINCLUDED */ 168 169