1 // Copyright 2009 The Archiveopteryx Developers <info@aox.org>
2 
3 #include "html.h"
4 
5 #include "utf.h"
6 #include "codec.h"
7 #include "ustring.h"
8 #include "entities.h"
9 
10 #include <ctype.h>
11 
12 
13 /*! \class HTML html.h
14     This class is responsible for extracting indexable text from HTML.
15     Its interface is subject to change once there are other classes to
16     do the same thing for other formats.
17 */
18 
19 
20 /*! Returns indexable text extracted from \a h. */
21 
asText(const UString & h)22 UString HTML::asText( const UString &h )
23 {
24     UString r;
25     UString t, s, qs, a;
26     char last = 0;
27     char quote = 0;
28     char c;
29     uint mark = 0;
30     AsciiCodec dc; // just a dummy so we can use Codec::append()
31 
32     int tag = 0;        /* 1 inside <...> */
33     int tagname = 0;    /* 1 inside tag, before whitespace */
34     int sgml = 0;       /* 1 inside <[!?]...> */
35     int quoted = 0;     /* 1 inside <foo bar="..."> */
36 
37     uint i = 0;
38     while ( i < h.length() ) {
39         /* Each case below sets i to the position of the last character
40            it processed. */
41         switch ( h[i] ) {
42         case '<':
43             if ( quoted )
44                 goto next;
45             if ( h[i+1] == '!' || h[i+1] == '?' ) {
46                 sgml = 1;
47                 i++;
48             }
49             tag = 1;
50             tagname = 1;
51             t.truncate();
52             break;
53 
54         case '>':
55             if ( quoted )
56                 goto next;
57             if ( tag ) {
58                 //t = t.lower();
59                 if ( t == "p" ) {
60                     s.append( '\n' );
61                     s.append( '\n' );
62                 }
63                 else if ( t == "br" ) {
64                     s.append( '\n' );
65                 }
66                 else if ( t == "body" ) {
67                     r.truncate();
68                 }
69                 sgml = tag = 0;
70             }
71             break;
72 
73         case '-':
74             if ( !sgml )
75                 goto unspecial;
76             if ( quoted && quote != '-' )
77                 goto next;
78             if ( last == '-' ) {
79                 quote = '-';
80                 quoted = !quoted;
81             }
82             break;
83 
84         case '"':
85         case '\'':
86             if ( !tag )
87                 goto unspecial;
88             if ( quoted && quote == h[i] ) {
89                 quoted = 0;
90             } else if ( !quoted && last == '=' ) {
91                 quoted = 1;
92                 quote = h[i];
93                 qs.truncate();
94             }
95             break;
96 
97         case ' ':
98         case '\t':
99         case '\r':
100         case '\n':
101             /* Whitespace shouldn't appear in last, and we compress it
102                to one space. */
103             if ( !tag && s.isEmpty() )
104                 s.append( ' ' );
105             tagname = false;
106             a.truncate();
107             i++;
108             continue;
109             break;
110 
111         case '&':
112             /* May be a character reference. */
113             if ( ( c = h[i+1] ) == '#' ) {
114                 char d = h[i+2] | 0x20;
115 
116                 if ( isdigit( d ) ) {
117                     /* Decimal numeric reference: &#[0-9]+;? */
118                     i += 2;
119                     mark = i++;
120                     while ( isdigit( h[i] ) )
121                         i++;
122                     r.append( s );
123                     dc.append( r, h.mid( mark, i-mark ).number( 0 ) );
124                     s.truncate();
125 
126                     /* The terminating semicolon is required only
127                        where the next character would otherwise be
128                        interpreted as a part of the reference. */
129                     if ( h[i] != ';' )
130                         i--;
131                 }
132                 else if ( d == 'x' ) {
133                     /* Hexadecimal numeric reference: &#[xX][0-9A-Za-z]+;? */
134                     i += 2;
135                     mark = ++i;
136                     while ( isxdigit( h[i] ) )
137                         i++;
138                     if ( i != mark ) {
139                         r.append( s );
140                         dc.append( r, h.mid( mark, i-mark ).number( 0, 16 ) );
141                         s.truncate();
142                     }
143                     if ( h[i] != ';' )
144                         i--;
145                 }
146                 else {
147                     /* Not a reference. */
148                     i++;
149                     r.append( s );
150                     r.append( '&' );
151                     r.append( '#' );
152                     s.truncate();
153                 }
154             } else if ( isalpha( c ) ) {
155                 /* Entity reference: &[a-zA-Z0-9]+;? */
156                 i++;
157                 mark = i++;
158                 while ( isalnum( h[i] ) )
159                     i++;
160                 UString ent( h.mid( mark, i-mark ) );
161                 if ( h[i] != ';' )
162                     i--;
163 
164                 int n = 0;
165                 while ( n < ents ) {
166                     uint l = 0;
167                     bool match = true;
168                     while ( l < ent.length() ) {
169                         if ( ent[l] != entities[n].name[l] ) {
170                             match = false;
171                             break;
172                         }
173                         l++;
174                     }
175 
176                     if ( match && entities[n].name[l] == '\0' ) {
177                         r.append( s );
178                         r.append( entities[n].chr );
179                         s.truncate();
180                         break;
181                     }
182 
183                     n++;
184                 }
185             }
186             else {
187                 /* Not a reference. */
188                 r.append( s );
189                 r.append( '&' );
190                 s.truncate();
191             }
192             break;
193 
194     unspecial:
195         default:
196             if ( !tag ) {
197                 r.append( s );
198                 dc.append( r, h[i] );
199                 s.truncate();
200             } else if ( tagname ) {
201                 t.append( h[i] );
202             } else if ( !quoted && h[i] == '=' ) {
203                 a.truncate();
204             } else {
205                 a.append( h[i] );
206             }
207             break;
208         }
209 
210     next:
211         last = h[i];
212         i++;
213     }
214 
215     dc.mangleTrailingSurrogate( r );
216     // we ignore dc.state()
217     return r;
218 }
219