1 // Copyright 2009 The Archiveopteryx Developers <info@aox.org>
2
3 #include "html.h"
4
5 #include "utf.h"
6 #include "codec.h"
7 #include "ustring.h"
8 #include "entities.h"
9
10 #include <ctype.h>
11
12
13 /*! \class HTML html.h
14 This class is responsible for extracting indexable text from HTML.
15 Its interface is subject to change once there are other classes to
16 do the same thing for other formats.
17 */
18
19
20 /*! Returns indexable text extracted from \a h. */
21
asText(const UString & h)22 UString HTML::asText( const UString &h )
23 {
24 UString r;
25 UString t, s, qs, a;
26 char last = 0;
27 char quote = 0;
28 char c;
29 uint mark = 0;
30 AsciiCodec dc; // just a dummy so we can use Codec::append()
31
32 int tag = 0; /* 1 inside <...> */
33 int tagname = 0; /* 1 inside tag, before whitespace */
34 int sgml = 0; /* 1 inside <[!?]...> */
35 int quoted = 0; /* 1 inside <foo bar="..."> */
36
37 uint i = 0;
38 while ( i < h.length() ) {
39 /* Each case below sets i to the position of the last character
40 it processed. */
41 switch ( h[i] ) {
42 case '<':
43 if ( quoted )
44 goto next;
45 if ( h[i+1] == '!' || h[i+1] == '?' ) {
46 sgml = 1;
47 i++;
48 }
49 tag = 1;
50 tagname = 1;
51 t.truncate();
52 break;
53
54 case '>':
55 if ( quoted )
56 goto next;
57 if ( tag ) {
58 //t = t.lower();
59 if ( t == "p" ) {
60 s.append( '\n' );
61 s.append( '\n' );
62 }
63 else if ( t == "br" ) {
64 s.append( '\n' );
65 }
66 else if ( t == "body" ) {
67 r.truncate();
68 }
69 sgml = tag = 0;
70 }
71 break;
72
73 case '-':
74 if ( !sgml )
75 goto unspecial;
76 if ( quoted && quote != '-' )
77 goto next;
78 if ( last == '-' ) {
79 quote = '-';
80 quoted = !quoted;
81 }
82 break;
83
84 case '"':
85 case '\'':
86 if ( !tag )
87 goto unspecial;
88 if ( quoted && quote == h[i] ) {
89 quoted = 0;
90 } else if ( !quoted && last == '=' ) {
91 quoted = 1;
92 quote = h[i];
93 qs.truncate();
94 }
95 break;
96
97 case ' ':
98 case '\t':
99 case '\r':
100 case '\n':
101 /* Whitespace shouldn't appear in last, and we compress it
102 to one space. */
103 if ( !tag && s.isEmpty() )
104 s.append( ' ' );
105 tagname = false;
106 a.truncate();
107 i++;
108 continue;
109 break;
110
111 case '&':
112 /* May be a character reference. */
113 if ( ( c = h[i+1] ) == '#' ) {
114 char d = h[i+2] | 0x20;
115
116 if ( isdigit( d ) ) {
117 /* Decimal numeric reference: &#[0-9]+;? */
118 i += 2;
119 mark = i++;
120 while ( isdigit( h[i] ) )
121 i++;
122 r.append( s );
123 dc.append( r, h.mid( mark, i-mark ).number( 0 ) );
124 s.truncate();
125
126 /* The terminating semicolon is required only
127 where the next character would otherwise be
128 interpreted as a part of the reference. */
129 if ( h[i] != ';' )
130 i--;
131 }
132 else if ( d == 'x' ) {
133 /* Hexadecimal numeric reference: &#[xX][0-9A-Za-z]+;? */
134 i += 2;
135 mark = ++i;
136 while ( isxdigit( h[i] ) )
137 i++;
138 if ( i != mark ) {
139 r.append( s );
140 dc.append( r, h.mid( mark, i-mark ).number( 0, 16 ) );
141 s.truncate();
142 }
143 if ( h[i] != ';' )
144 i--;
145 }
146 else {
147 /* Not a reference. */
148 i++;
149 r.append( s );
150 r.append( '&' );
151 r.append( '#' );
152 s.truncate();
153 }
154 } else if ( isalpha( c ) ) {
155 /* Entity reference: &[a-zA-Z0-9]+;? */
156 i++;
157 mark = i++;
158 while ( isalnum( h[i] ) )
159 i++;
160 UString ent( h.mid( mark, i-mark ) );
161 if ( h[i] != ';' )
162 i--;
163
164 int n = 0;
165 while ( n < ents ) {
166 uint l = 0;
167 bool match = true;
168 while ( l < ent.length() ) {
169 if ( ent[l] != entities[n].name[l] ) {
170 match = false;
171 break;
172 }
173 l++;
174 }
175
176 if ( match && entities[n].name[l] == '\0' ) {
177 r.append( s );
178 r.append( entities[n].chr );
179 s.truncate();
180 break;
181 }
182
183 n++;
184 }
185 }
186 else {
187 /* Not a reference. */
188 r.append( s );
189 r.append( '&' );
190 s.truncate();
191 }
192 break;
193
194 unspecial:
195 default:
196 if ( !tag ) {
197 r.append( s );
198 dc.append( r, h[i] );
199 s.truncate();
200 } else if ( tagname ) {
201 t.append( h[i] );
202 } else if ( !quoted && h[i] == '=' ) {
203 a.truncate();
204 } else {
205 a.append( h[i] );
206 }
207 break;
208 }
209
210 next:
211 last = h[i];
212 i++;
213 }
214
215 dc.mangleTrailingSurrogate( r );
216 // we ignore dc.state()
217 return r;
218 }
219