1 /* ----------------------------------------------------------------------
2      MIME Mangler - single pass reduction of MIME to plain text
3 
4      Laurence Lundblade <lgl@qualcomm.com>
5 
6      Copyright 1997,1998,1999 QUALCOMM Incorporated
7 
8      File: striphtml.c
9      Version: 0.3.0, May 1999
10      Last Edited:
11 
12   ---- */
13 
14 /* Original header; before LGL hacking */
15 /*
16 || File:    striphtml.c - HTML-to-text converter.
17 || Author:  Brian Kelley, Qualcomm Inc.
18 || Notice:  Copyright 1997 Qualcomm, Inc.
19 || Date:    14-Sep-97
20 || Hacked by: Laurence Lundblade
21 ||
22 || Handles:
23 || - SGML tags and comments, including quoted ">" in attribute values
24 || - SGML numeric references (eg: &#xx;)
25 || - HTML entities (eg: &amp;)
26 || - stripping redundant spaces
27 || - Paragraph breaks caused by HTML open and close elements
28 || - HTML preformatted blocks (PRE, LISTING, XMP, PLAINTEXT)
29 || - Special processing escapes: XMP and PLAINTEXT
30 || - stripping contents of certain HTML elements (COMMENT, SCRIPT,
31 ||    SELECT, TITLE).
32 || - LI items marked with a bullet character (not ISO8859, but
33 ||    compatible with Windows, Mac, Pilot ... I hope)
34 ||
|| This does _not_ try to:
36 || - Wrap text to a given width
37 || - Mimic HTML indentation using spaces
38 */
39 
40 #ifndef _STRIPHTMLINCLUDED
41 #define _STRIPHTMLINCLUDED
42 #include "config.h"
43 #include <mime.h>
44 #include "charmangle.h"
/* Short name for the converter object; struct Html is laid out further
   down in this header. */
typedef struct Html Html;

/* Size of the m_cURLbuf array in struct Flowed2HTML (that member is only
   compiled in when _TRAP_URLS is defined). */
#define URL_BUF_SIZE 1024
48 
49 
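/*
|| This callback is used by the Html object to output processed text.
||
|| OutputFn is not declared in this header; it is expected to come from
|| one of the headers included above (presumably <mime.h> -- confirm).
|| NOTE(review): the declarations in this file appear to use OutputFn as a
|| function type (e.g. "OutputFn *pfOut;"), not as the pointer type shown
|| in the historical declaration below, which is kept for reference only:
||
||    typedef void (*OutputFn)(void *pv, const char *buf, int len);
*/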
55 
56 
57 /*
58  || For static and automatic allocation of Html objects, the structure
59  || definition is included in this header file.  If using dynamic
60  || allocation for Html objects, Html_New() and Html_Delete() can be used
61  || instead of Html_Ctor() and Html_Dtor(), and the structure definition
62  || can be moved out of the header file into striphtml.c.  That way,
63  || client code will have absolutely no dependence on Html's internal
64  || structure linked into it.
65  */
66 
67 extern void Html_Ctor __PROTO((Html *me, OutputFn pfOut, void *pvOut));
68 extern void Html_Dtor __PROTO(());
69 
70 /*
71 || This is the function you use to feed data to the Html object.
72 */
73 extern void Html_Write __PROTO((Html *me, const char *szText, int cbText));
74 
75 
76 
/* Array size of the name buffers in struct Html (szData, szXMP and
   attrName): the most characters kept from an element or attribute name. */
#define DATAMAX 12

/* Array size of attrVal in struct Html: the most characters kept from an
   attribute value. */
#define ATTRMAX 20
80 
81 
82 struct Html {
83    OutputFn *pfOut;
84    void *   pvOut;
85 
86    int      nState;
87    int      chQuote;	     /* terminating quote character when in
88                                 ssValueQuoted */
89    int      cntDashes;       /* used in ssComment state to test for
90                                 termination of comment */
91    int      ndxData;         /* next index into szData */
92    char     szData[DATAMAX]; /* current tag name */
93    char     szXMP[DATAMAX];  /* special non-SGML mode for HTML compatibility */
94 
95    /* Text parsing state */
96    int      nTextState;
97    int      nBreaks;
98    int      nCol;
99 
100    /* Save an attribute */
101    int      nattrName;
102    char     attrName[DATAMAX];
103    int      nattrVal;
104    char     attrVal[ATTRMAX];
105 };
106 
107 
108 void     *textHTMLInit __PROTO((OutputFn oFn, void *oFnState,
109 				TextCharSetType partCharSet,
110 				TextCharSetType reqCharSet));
111 OutputFn  textHTMLToPlain;
112 
113 
114 /* ----------------------------------------------------------------------
115    Conversion of text/plain to HTML
116    ---- */
117 OutputFn  textPlainToHTMLHead;
118 OutputFn  textPlainToHTML;
119 OutputFn  textPlainToHTMLTail;
120 
121 
122 /* ----------------------------------------------------------------------
123    Conversion of text/plain format=flowed to HTML
124    ---- */
125 struct Flowed2HTML {
126   /* Where to send the output next */
127   OutputFn *pfOut;
128   void *   pvOut;
129 
130   /* Are we in a paragraph? */
131   int      m_bInPara;
132 
133   /* Count quote depth   */
134   int      m_nLastQuoteDepth;
135   int      m_nQuoteDepth;
136 
137   int      m_bLook4Quote;
138 
139   /* Are we in a line's content, or at its start? */
140   enum {AT_START,
141         IN_CONTENT} m_nLineState;
142 
143   /* Track runs of spaces (including trailing spaces) */
144   int      m_nSpaceCount;
145 
146   /* Treat signature separator line specially */
147   enum {NO_DASH,
148         FIRST_DASH,
149         SECOND_DASH,
150         DASH_DASH_SPACE} m_nSigSepState;
151 
152 #ifdef _TRAP_URLS
153   /* Buffer potential URLs */
154   char     m_cURLbuf [ URL_BUF_SIZE ];
155   int      m_nURLcount;
156 #endif
157 
158 };
159 
160 void     *textFlowedToHTMLInit  __PROTO((OutputFn oFn, void *oFnState));
161 OutputFn  textFlowedToHTMLHead;
162 void      ffToHTML_lineStart    __PROTO((struct Flowed2HTML *me));
163 OutputFn  textFlowedToHTML;
164 OutputFn  textFlowedToHTMLTail;
165 
166 
#endif /* _STRIPHTMLINCLUDED */
168 
169