1 /////////////////////////////////////////////////////////////////////////////
2 // Name:        src/html/htmlpars.cpp
3 // Purpose:     wxHtmlParser class (generic parser)
4 // Author:      Vaclav Slavik
5 // RCS-ID:      $Id: htmlpars.cpp 66413 2010-12-20 17:40:05Z JS $
6 // Copyright:   (c) 1999 Vaclav Slavik
7 // Licence:     wxWindows licence
8 /////////////////////////////////////////////////////////////////////////////
9 
10 #include "wx/wxprec.h"
11 
12 #ifdef __BORLANDC__
13     #pragma hdrstop
14 #endif
15 
16 #if wxUSE_HTML && wxUSE_STREAMS
17 
18 #ifndef WXPRECOMP
19     #include "wx/dynarray.h"
20     #include "wx/log.h"
21     #include "wx/intl.h"
22     #include "wx/app.h"
23 #endif
24 
25 #include "wx/tokenzr.h"
26 #include "wx/wfstream.h"
27 #include "wx/url.h"
28 #include "wx/fontmap.h"
29 #include "wx/html/htmldefs.h"
30 #include "wx/html/htmlpars.h"
31 #include "wx/arrimpl.cpp"
32 
33 #ifdef __WXWINCE__
34     #include "wx/msw/wince/missing.h"       // for bsearch()
35 #endif
36 
// DLL options compatibility check:
WX_CHECK_BUILD_OPTIONS("wxHTML")

// Mask string handed to wxLogTrace() by the entity parser in this file when
// it meets an entity name it cannot resolve.
const wxChar *wxTRACE_HTML_DEBUG = _T("htmldebug");
41 
42 //-----------------------------------------------------------------------------
43 // wxHtmlParser helpers
44 //-----------------------------------------------------------------------------
45 
// One run of plain text inside the parsed source, stored as an
// offset/length pair rather than as a copy of the characters.
class wxHtmlTextPiece
{
public:
    wxHtmlTextPiece(int pos, int lng)
        : m_pos(pos),
          m_lng(lng)
    {
    }

    int m_pos;  // offset of the first character of the run in the source
    int m_lng;  // number of characters in the run
};
52 
// Declare and define wxHtmlTextPieces, the object array of wxHtmlTextPiece
// that the parser allocates in CreateDOMTree() and fills in
// CreateDOMSubTree().
WX_DECLARE_OBJARRAY(wxHtmlTextPiece, wxHtmlTextPieces);
WX_DEFINE_OBJARRAY(wxHtmlTextPieces)
55 
56 class wxHtmlParserState
57 {
58 public:
59     wxHtmlTag         *m_curTag;
60     wxHtmlTag         *m_tags;
61     wxHtmlTextPieces  *m_textPieces;
62     int                m_curTextPiece;
63     wxString           m_source;
64     wxHtmlParserState *m_nextState;
65 };
66 
67 //-----------------------------------------------------------------------------
68 // wxHtmlParser
69 //-----------------------------------------------------------------------------
70 
IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser,wxObject)

// Creates a parser with no source, no DOM tree, no saved states and no
// pushed handler tables. The entities parser is allocated here and released
// in the destructor.
wxHtmlParser::wxHtmlParser()
    : wxObject(), m_HandlersHash(wxKEY_STRING),
      m_FS(NULL), m_HandlersStack(NULL)
{
    m_entitiesParser = new wxHtmlEntitiesParser;
    m_Tags = NULL;
    m_CurTag = NULL;
    m_TextPieces = NULL;
    m_CurTextPiece = 0;
    m_SavedStates = NULL;

    // DoParsing() and AddTag() read this flag; without this assignment it
    // would stay uninitialised until InitParser() runs, which callers using
    // SetSource() + DoParsing() directly never do.
    m_stopParsing = false;
}
84 
~wxHtmlParser()85 wxHtmlParser::~wxHtmlParser()
86 {
87     while (RestoreState()) {}
88     DestroyDOMTree();
89 
90     if (m_HandlersStack)
91     {
92         wxList& tmp = *m_HandlersStack;
93         wxList::iterator it, en;
94         for( it = tmp.begin(), en = tmp.end(); it != en; ++it )
95             delete (wxHashTable*)*it;
96         tmp.clear();
97     }
98     delete m_HandlersStack;
99     m_HandlersHash.Clear();
100     WX_CLEAR_LIST(wxList, m_HandlersList);
101     delete m_entitiesParser;
102 }
103 
Parse(const wxString & source)104 wxObject* wxHtmlParser::Parse(const wxString& source)
105 {
106     InitParser(source);
107     DoParsing();
108     wxObject *result = GetProduct();
109     DoneParser();
110     return result;
111 }
112 
InitParser(const wxString & source)113 void wxHtmlParser::InitParser(const wxString& source)
114 {
115     SetSource(source);
116     m_stopParsing = false;
117 }
118 
DoneParser()119 void wxHtmlParser::DoneParser()
120 {
121     DestroyDOMTree();
122 }
123 
SetSource(const wxString & src)124 void wxHtmlParser::SetSource(const wxString& src)
125 {
126     DestroyDOMTree();
127     m_Source = src;
128     CreateDOMTree();
129     m_CurTag = NULL;
130     m_CurTextPiece = 0;
131 }
132 
CreateDOMTree()133 void wxHtmlParser::CreateDOMTree()
134 {
135     wxHtmlTagsCache cache(m_Source);
136     m_TextPieces = new wxHtmlTextPieces;
137     CreateDOMSubTree(NULL, 0, m_Source.length(), &cache);
138     m_CurTextPiece = 0;
139 }
140 
141 extern bool wxIsCDATAElement(const wxChar *tag);
142 
CreateDOMSubTree(wxHtmlTag * cur,int begin_pos,int end_pos,wxHtmlTagsCache * cache)143 void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur,
144                                     int begin_pos, int end_pos,
145                                     wxHtmlTagsCache *cache)
146 {
147     if (end_pos <= begin_pos) return;
148 
149     wxChar c;
150     int i = begin_pos;
151     int textBeginning = begin_pos;
152 
153     // If the tag contains CDATA text, we include the text between beginning
154     // and ending tag verbosely. Setting i=end_pos will skip to the very
155     // end of this function where text piece is added, bypassing any child
156     // tags parsing (CDATA element can't have child elements by definition):
157     if (cur != NULL && wxIsCDATAElement(cur->GetName().c_str()))
158     {
159         i = end_pos;
160     }
161 
162     while (i < end_pos)
163     {
164         c = m_Source.GetChar(i);
165 
166         if (c == wxT('<'))
167         {
168             // add text to m_TextPieces:
169             if (i - textBeginning > 0)
170                 m_TextPieces->Add(
171                     wxHtmlTextPiece(textBeginning, i - textBeginning));
172 
173             // if it is a comment, skip it:
174             if (i < end_pos-6 && m_Source.GetChar(i+1) == wxT('!') &&
175                                  m_Source.GetChar(i+2) == wxT('-') &&
176                                  m_Source.GetChar(i+3) == wxT('-'))
177             {
178                 // Comments begin with "<!--" and end with "--[ \t\r\n]*>"
179                 // according to HTML 4.0
180                 int dashes = 0;
181                 i += 4;
182                 while (i < end_pos)
183                 {
184                     c = m_Source.GetChar(i++);
185                     if ((c == wxT(' ') || c == wxT('\n') ||
186                         c == wxT('\r') || c == wxT('\t')) && dashes >= 2) {}
187                     else if (c == wxT('>') && dashes >= 2)
188                     {
189                         textBeginning = i;
190                         break;
191                     }
192                     else if (c == wxT('-'))
193                         dashes++;
194                     else
195                         dashes = 0;
196                 }
197             }
198 
199             // add another tag to the tree:
200             else if (i < end_pos-1 && m_Source.GetChar(i+1) != wxT('/'))
201             {
202                 wxHtmlTag *chd;
203                 if (cur)
204                     chd = new wxHtmlTag(cur, m_Source,
205                                         i, end_pos, cache, m_entitiesParser);
206                 else
207                 {
208                     chd = new wxHtmlTag(NULL, m_Source,
209                                         i, end_pos, cache, m_entitiesParser);
210                     if (!m_Tags)
211                     {
212                         // if this is the first tag to be created make the root
213                         // m_Tags point to it:
214                         m_Tags = chd;
215                     }
216                     else
217                     {
218                         // if there is already a root tag add this tag as
219                         // the last sibling:
220                         chd->m_Prev = m_Tags->GetLastSibling();
221                         chd->m_Prev->m_Next = chd;
222                     }
223                 }
224 
225                 if (chd->HasEnding())
226                 {
227                     CreateDOMSubTree(chd,
228                                      chd->GetBeginPos(), chd->GetEndPos1(),
229                                      cache);
230                     i = chd->GetEndPos2();
231                 }
232                 else
233                     i = chd->GetBeginPos();
234 
235                 textBeginning = i;
236             }
237 
238             // ... or skip ending tag:
239             else
240             {
241                 while (i < end_pos && m_Source.GetChar(i) != wxT('>')) i++;
242                 textBeginning = i+1;
243             }
244         }
245         else i++;
246     }
247 
248     // add remaining text to m_TextPieces:
249     if (end_pos - textBeginning > 0)
250         m_TextPieces->Add(
251             wxHtmlTextPiece(textBeginning, end_pos - textBeginning));
252 }
253 
DestroyDOMTree()254 void wxHtmlParser::DestroyDOMTree()
255 {
256     wxHtmlTag *t1, *t2;
257     t1 = m_Tags;
258     while (t1)
259     {
260         t2 = t1->GetNextSibling();
261         delete t1;
262         t1 = t2;
263     }
264     m_Tags = m_CurTag = NULL;
265 
266     delete m_TextPieces;
267     m_TextPieces = NULL;
268 }
269 
DoParsing()270 void wxHtmlParser::DoParsing()
271 {
272     m_CurTag = m_Tags;
273     m_CurTextPiece = 0;
274     DoParsing(0, m_Source.length());
275 }
276 
DoParsing(int begin_pos,int end_pos)277 void wxHtmlParser::DoParsing(int begin_pos, int end_pos)
278 {
279     if (end_pos <= begin_pos) return;
280 
281     wxHtmlTextPieces& pieces = *m_TextPieces;
282     size_t piecesCnt = pieces.GetCount();
283 
284     while (begin_pos < end_pos)
285     {
286         while (m_CurTag && m_CurTag->GetBeginPos() < begin_pos)
287             m_CurTag = m_CurTag->GetNextTag();
288         while (m_CurTextPiece < piecesCnt &&
289                pieces[m_CurTextPiece].m_pos < begin_pos)
290             m_CurTextPiece++;
291 
292         if (m_CurTextPiece < piecesCnt &&
293             (!m_CurTag ||
294              pieces[m_CurTextPiece].m_pos < m_CurTag->GetBeginPos()))
295         {
296             // Add text:
297             AddText(GetEntitiesParser()->Parse(
298                        m_Source.Mid(pieces[m_CurTextPiece].m_pos,
299                                     pieces[m_CurTextPiece].m_lng)));
300             begin_pos = pieces[m_CurTextPiece].m_pos +
301                         pieces[m_CurTextPiece].m_lng;
302             m_CurTextPiece++;
303         }
304         else if (m_CurTag)
305         {
306             if (m_CurTag->HasEnding())
307                 begin_pos = m_CurTag->GetEndPos2();
308             else
309                 begin_pos = m_CurTag->GetBeginPos();
310             wxHtmlTag *t = m_CurTag;
311             m_CurTag = m_CurTag->GetNextTag();
312             AddTag(*t);
313             if (m_stopParsing)
314                 return;
315         }
316         else break;
317     }
318 }
319 
AddTag(const wxHtmlTag & tag)320 void wxHtmlParser::AddTag(const wxHtmlTag& tag)
321 {
322     wxHtmlTagHandler *h;
323     bool inner = false;
324 
325     h = (wxHtmlTagHandler*) m_HandlersHash.Get(tag.GetName());
326     if (h)
327     {
328         inner = h->HandleTag(tag);
329         if (m_stopParsing)
330             return;
331     }
332     if (!inner)
333     {
334         if (tag.HasEnding())
335             DoParsing(tag.GetBeginPos(), tag.GetEndPos1());
336     }
337 }
338 
AddTagHandler(wxHtmlTagHandler * handler)339 void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler)
340 {
341     wxString s(handler->GetSupportedTags());
342     wxStringTokenizer tokenizer(s, wxT(", "));
343 
344     while (tokenizer.HasMoreTokens())
345         m_HandlersHash.Put(tokenizer.GetNextToken(), handler);
346 
347     if (m_HandlersList.IndexOf(handler) == wxNOT_FOUND)
348         m_HandlersList.Append(handler);
349 
350     handler->SetParser(this);
351 }
352 
PushTagHandler(wxHtmlTagHandler * handler,const wxString & tags)353 void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags)
354 {
355     wxStringTokenizer tokenizer(tags, wxT(", "));
356     wxString key;
357 
358     if (m_HandlersStack == NULL)
359     {
360         m_HandlersStack = new wxList;
361     }
362 
363     m_HandlersStack->Insert((wxObject*)new wxHashTable(m_HandlersHash));
364 
365     while (tokenizer.HasMoreTokens())
366     {
367         key = tokenizer.GetNextToken();
368         m_HandlersHash.Delete(key);
369         m_HandlersHash.Put(key, handler);
370     }
371 }
372 
PopTagHandler()373 void wxHtmlParser::PopTagHandler()
374 {
375     wxList::compatibility_iterator first;
376 
377     if ( !m_HandlersStack ||
378 #if wxUSE_STL
379          !(first = m_HandlersStack->GetFirst())
380 #else // !wxUSE_STL
381          ((first = m_HandlersStack->GetFirst()) == NULL)
382 #endif // wxUSE_STL/!wxUSE_STL
383         )
384     {
385         wxLogWarning(_("Warning: attempt to remove HTML tag handler from empty stack."));
386         return;
387     }
388     m_HandlersHash = *((wxHashTable*) first->GetData());
389     delete (wxHashTable*) first->GetData();
390     m_HandlersStack->Erase(first);
391 }
392 
SetSourceAndSaveState(const wxString & src)393 void wxHtmlParser::SetSourceAndSaveState(const wxString& src)
394 {
395     wxHtmlParserState *s = new wxHtmlParserState;
396 
397     s->m_curTag = m_CurTag;
398     s->m_tags = m_Tags;
399     s->m_textPieces = m_TextPieces;
400     s->m_curTextPiece = m_CurTextPiece;
401     s->m_source = m_Source;
402 
403     s->m_nextState = m_SavedStates;
404     m_SavedStates = s;
405 
406     m_CurTag = NULL;
407     m_Tags = NULL;
408     m_TextPieces = NULL;
409     m_CurTextPiece = 0;
410     m_Source = wxEmptyString;
411 
412     SetSource(src);
413 }
414 
RestoreState()415 bool wxHtmlParser::RestoreState()
416 {
417     if (!m_SavedStates) return false;
418 
419     DestroyDOMTree();
420 
421     wxHtmlParserState *s = m_SavedStates;
422     m_SavedStates = s->m_nextState;
423 
424     m_CurTag = s->m_curTag;
425     m_Tags = s->m_tags;
426     m_TextPieces = s->m_textPieces;
427     m_CurTextPiece = s->m_curTextPiece;
428     m_Source = s->m_source;
429 
430     delete s;
431     return true;
432 }
433 
GetInnerSource(const wxHtmlTag & tag)434 wxString wxHtmlParser::GetInnerSource(const wxHtmlTag& tag)
435 {
436     return GetSource()->Mid(tag.GetBeginPos(),
437                             tag.GetEndPos1() - tag.GetBeginPos());
438 }
439 
440 //-----------------------------------------------------------------------------
441 // wxHtmlTagHandler
442 //-----------------------------------------------------------------------------
443 
IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler,wxObject)

// Parses `source` with the owning parser as if it were a document of its
// own, then puts the parser back into the state it had on entry.
void wxHtmlTagHandler::ParseInnerSource(const wxString& source)
{
    // Swapping the source in the middle of a parse is fine as long as the
    // previous state is saved before and restored afterwards.
    m_Parser->SetSourceAndSaveState(source);

    m_Parser->DoParsing();

    m_Parser->RestoreState();
}
454 
455 
456 //-----------------------------------------------------------------------------
457 // wxHtmlEntitiesParser
458 //-----------------------------------------------------------------------------
459 
IMPLEMENT_DYNAMIC_CLASS(wxHtmlEntitiesParser,wxObject)

// In builds that have wchar_t but are not Unicode, the parser starts with no
// converter and the system encoding; other builds have nothing to set up.
wxHtmlEntitiesParser::wxHtmlEntitiesParser()
{
#if wxUSE_WCHAR_T && !wxUSE_UNICODE
    m_conv = NULL;
    m_encoding = wxFONTENCODING_SYSTEM;
#endif // wxUSE_WCHAR_T && !wxUSE_UNICODE
}
468 
~wxHtmlEntitiesParser()469 wxHtmlEntitiesParser::~wxHtmlEntitiesParser()
470 {
471 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
472     delete m_conv;
473 #endif
474 }
475 
SetEncoding(wxFontEncoding encoding)476 void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding)
477 {
478 #if wxUSE_WCHAR_T && !wxUSE_UNICODE
479     if (encoding == m_encoding)
480         return;
481 
482     delete m_conv;
483 
484     m_encoding = encoding;
485     if (m_encoding == wxFONTENCODING_SYSTEM)
486         m_conv = NULL;
487     else
488         m_conv = new wxCSConv(wxFontMapper::GetEncodingName(m_encoding));
489 #else
490     (void) encoding;
491 #endif
492 }
493 
// Returns a copy of `input` in which every recognised entity reference
// ("&name;", "&#NNN;", "&#xHHH;"; the terminating ';' is optional) is
// replaced by the character GetEntityChar() yields for it. References that
// are not recognised are copied through unchanged and reported via
// wxLogTrace(). If the input contains no '&' that starts a reference, the
// input itself is returned.
wxString wxHtmlEntitiesParser::Parse(const wxString& input)
{
    const wxChar *c, *last;
    const wxChar *in_str = input.c_str();
    wxString output;

    // `last` always points at the first character that has not been copied
    // to `output` yet.
    for (c = in_str, last = in_str; *c != wxT('\0'); c++)
    {
        if (*c == wxT('&'))
        {
            // A lone '&' ending the input cannot start an entity. Stop
            // before copying anything: the tail code below emits the
            // pending text together with the '&'. Copying the pending text
            // here as well would duplicate it (e.g. "a&amp;b&" used to
            // become "a&bb&").
            if (c[1] == wxT('\0'))
                break;

            if ( output.empty() )
                output.reserve(input.length());

            // flush the plain text preceding the reference
            if (c - last > 0)
                output.append(last, c - last);
            ++c;

            wxString entity;
            const wxChar *ent_s = c;
            wxChar entity_char;

            // collect the entity name: letters, digits, '_' and '#'
            for (; (*c >= wxT('a') && *c <= wxT('z')) ||
                   (*c >= wxT('A') && *c <= wxT('Z')) ||
                   (*c >= wxT('0') && *c <= wxT('9')) ||
                   *c == wxT('_') || *c == wxT('#'); c++) {}
            entity.append(ent_s, c - ent_s);

            // Swallow the ';' if there is one; otherwise step back so that
            // the character which ended the name is processed normally.
            if (*c != wxT(';')) c--;
            last = c+1;
            entity_char = GetEntityChar(entity);
            if (entity_char)
                output << entity_char;
            else
            {
                // unknown entity: keep the original text, '&' included
                output.append(ent_s-1, c-ent_s+2);
                wxLogTrace(wxTRACE_HTML_DEBUG,
                           wxT("Unrecognized HTML entity: '%s'"),
                           entity.c_str());
            }
        }
    }
    if (last == in_str) // common case: no entity
        return input;
    if (*last != wxT('\0'))
        output.append(last);
    return output;
}
541 
// One row of the named-entity table used by
// wxHtmlEntitiesParser::GetEntityChar(). It must stay a plain aggregate with
// this member order, because the table is written with brace initialisers.
struct wxHtmlEntityInfo
{
    const wxChar *name;     // entity name without '&' and ';' (NULL in the
                            // terminating row)
    unsigned code;          // numeric code of the entity (0 in the
                            // terminating row)
};
547 
wxHtmlEntityCompare(const void * key,const void * item)548 extern "C" int LINKAGEMODE wxHtmlEntityCompare(const void *key, const void *item)
549 {
550     return wxStrcmp((wxChar*)key, ((wxHtmlEntityInfo*)item)->name);
551 }
552 
553 #if !wxUSE_UNICODE
GetCharForCode(unsigned code)554 wxChar wxHtmlEntitiesParser::GetCharForCode(unsigned code)
555 {
556 #if wxUSE_WCHAR_T
557     char buf[2];
558     wchar_t wbuf[2];
559     wbuf[0] = (wchar_t)code;
560     wbuf[1] = 0;
561     wxMBConv *conv = m_conv ? m_conv : &wxConvLocal;
562     if (conv->WC2MB(buf, wbuf, 2) == (size_t)-1)
563         return '?';
564     return buf[0];
565 #else
566     return (code < 256) ? (wxChar)code : '?';
567 #endif
568 }
569 #endif
570 
GetEntityChar(const wxString & entity)571 wxChar wxHtmlEntitiesParser::GetEntityChar(const wxString& entity)
572 {
573     unsigned code = 0;
574 
575     if (entity.empty())
576       return 0; // invalid entity reference
577 
578     if (entity[0] == wxT('#'))
579     {
580         const wxChar *ent_s = entity.c_str();
581         const wxChar *format;
582 
583         if (ent_s[1] == wxT('x') || ent_s[1] == wxT('X'))
584         {
585             format = wxT("%x");
586             ent_s++;
587         }
588         else
589             format = wxT("%u");
590         ent_s++;
591 
592         if (wxSscanf(ent_s, format, &code) != 1)
593             code = 0;
594     }
595     else
596     {
597         static wxHtmlEntityInfo substitutions[] = {
598             { wxT("AElig"),198 },
599             { wxT("Aacute"),193 },
600             { wxT("Acirc"),194 },
601             { wxT("Agrave"),192 },
602             { wxT("Alpha"),913 },
603             { wxT("Aring"),197 },
604             { wxT("Atilde"),195 },
605             { wxT("Auml"),196 },
606             { wxT("Beta"),914 },
607             { wxT("Ccedil"),199 },
608             { wxT("Chi"),935 },
609             { wxT("Dagger"),8225 },
610             { wxT("Delta"),916 },
611             { wxT("ETH"),208 },
612             { wxT("Eacute"),201 },
613             { wxT("Ecirc"),202 },
614             { wxT("Egrave"),200 },
615             { wxT("Epsilon"),917 },
616             { wxT("Eta"),919 },
617             { wxT("Euml"),203 },
618             { wxT("Gamma"),915 },
619             { wxT("Iacute"),205 },
620             { wxT("Icirc"),206 },
621             { wxT("Igrave"),204 },
622             { wxT("Iota"),921 },
623             { wxT("Iuml"),207 },
624             { wxT("Kappa"),922 },
625             { wxT("Lambda"),923 },
626             { wxT("Mu"),924 },
627             { wxT("Ntilde"),209 },
628             { wxT("Nu"),925 },
629             { wxT("OElig"),338 },
630             { wxT("Oacute"),211 },
631             { wxT("Ocirc"),212 },
632             { wxT("Ograve"),210 },
633             { wxT("Omega"),937 },
634             { wxT("Omicron"),927 },
635             { wxT("Oslash"),216 },
636             { wxT("Otilde"),213 },
637             { wxT("Ouml"),214 },
638             { wxT("Phi"),934 },
639             { wxT("Pi"),928 },
640             { wxT("Prime"),8243 },
641             { wxT("Psi"),936 },
642             { wxT("Rho"),929 },
643             { wxT("Scaron"),352 },
644             { wxT("Sigma"),931 },
645             { wxT("THORN"),222 },
646             { wxT("Tau"),932 },
647             { wxT("Theta"),920 },
648             { wxT("Uacute"),218 },
649             { wxT("Ucirc"),219 },
650             { wxT("Ugrave"),217 },
651             { wxT("Upsilon"),933 },
652             { wxT("Uuml"),220 },
653             { wxT("Xi"),926 },
654             { wxT("Yacute"),221 },
655             { wxT("Yuml"),376 },
656             { wxT("Zeta"),918 },
657             { wxT("aacute"),225 },
658             { wxT("acirc"),226 },
659             { wxT("acute"),180 },
660             { wxT("aelig"),230 },
661             { wxT("agrave"),224 },
662             { wxT("alefsym"),8501 },
663             { wxT("alpha"),945 },
664             { wxT("amp"),38 },
665             { wxT("and"),8743 },
666             { wxT("ang"),8736 },
667             { wxT("apos"),39 },
668             { wxT("aring"),229 },
669             { wxT("asymp"),8776 },
670             { wxT("atilde"),227 },
671             { wxT("auml"),228 },
672             { wxT("bdquo"),8222 },
673             { wxT("beta"),946 },
674             { wxT("brvbar"),166 },
675             { wxT("bull"),8226 },
676             { wxT("cap"),8745 },
677             { wxT("ccedil"),231 },
678             { wxT("cedil"),184 },
679             { wxT("cent"),162 },
680             { wxT("chi"),967 },
681             { wxT("circ"),710 },
682             { wxT("clubs"),9827 },
683             { wxT("cong"),8773 },
684             { wxT("copy"),169 },
685             { wxT("crarr"),8629 },
686             { wxT("cup"),8746 },
687             { wxT("curren"),164 },
688             { wxT("dArr"),8659 },
689             { wxT("dagger"),8224 },
690             { wxT("darr"),8595 },
691             { wxT("deg"),176 },
692             { wxT("delta"),948 },
693             { wxT("diams"),9830 },
694             { wxT("divide"),247 },
695             { wxT("eacute"),233 },
696             { wxT("ecirc"),234 },
697             { wxT("egrave"),232 },
698             { wxT("empty"),8709 },
699             { wxT("emsp"),8195 },
700             { wxT("ensp"),8194 },
701             { wxT("epsilon"),949 },
702             { wxT("equiv"),8801 },
703             { wxT("eta"),951 },
704             { wxT("eth"),240 },
705             { wxT("euml"),235 },
706             { wxT("euro"),8364 },
707             { wxT("exist"),8707 },
708             { wxT("fnof"),402 },
709             { wxT("forall"),8704 },
710             { wxT("frac12"),189 },
711             { wxT("frac14"),188 },
712             { wxT("frac34"),190 },
713             { wxT("frasl"),8260 },
714             { wxT("gamma"),947 },
715             { wxT("ge"),8805 },
716             { wxT("gt"),62 },
717             { wxT("hArr"),8660 },
718             { wxT("harr"),8596 },
719             { wxT("hearts"),9829 },
720             { wxT("hellip"),8230 },
721             { wxT("iacute"),237 },
722             { wxT("icirc"),238 },
723             { wxT("iexcl"),161 },
724             { wxT("igrave"),236 },
725             { wxT("image"),8465 },
726             { wxT("infin"),8734 },
727             { wxT("int"),8747 },
728             { wxT("iota"),953 },
729             { wxT("iquest"),191 },
730             { wxT("isin"),8712 },
731             { wxT("iuml"),239 },
732             { wxT("kappa"),954 },
733             { wxT("lArr"),8656 },
734             { wxT("lambda"),955 },
735             { wxT("lang"),9001 },
736             { wxT("laquo"),171 },
737             { wxT("larr"),8592 },
738             { wxT("lceil"),8968 },
739             { wxT("ldquo"),8220 },
740             { wxT("le"),8804 },
741             { wxT("lfloor"),8970 },
742             { wxT("lowast"),8727 },
743             { wxT("loz"),9674 },
744             { wxT("lrm"),8206 },
745             { wxT("lsaquo"),8249 },
746             { wxT("lsquo"),8216 },
747             { wxT("lt"),60 },
748             { wxT("macr"),175 },
749             { wxT("mdash"),8212 },
750             { wxT("micro"),181 },
751             { wxT("middot"),183 },
752             { wxT("minus"),8722 },
753             { wxT("mu"),956 },
754             { wxT("nabla"),8711 },
755             { wxT("nbsp"),160 },
756             { wxT("ndash"),8211 },
757             { wxT("ne"),8800 },
758             { wxT("ni"),8715 },
759             { wxT("not"),172 },
760             { wxT("notin"),8713 },
761             { wxT("nsub"),8836 },
762             { wxT("ntilde"),241 },
763             { wxT("nu"),957 },
764             { wxT("oacute"),243 },
765             { wxT("ocirc"),244 },
766             { wxT("oelig"),339 },
767             { wxT("ograve"),242 },
768             { wxT("oline"),8254 },
769             { wxT("omega"),969 },
770             { wxT("omicron"),959 },
771             { wxT("oplus"),8853 },
772             { wxT("or"),8744 },
773             { wxT("ordf"),170 },
774             { wxT("ordm"),186 },
775             { wxT("oslash"),248 },
776             { wxT("otilde"),245 },
777             { wxT("otimes"),8855 },
778             { wxT("ouml"),246 },
779             { wxT("para"),182 },
780             { wxT("part"),8706 },
781             { wxT("permil"),8240 },
782             { wxT("perp"),8869 },
783             { wxT("phi"),966 },
784             { wxT("pi"),960 },
785             { wxT("piv"),982 },
786             { wxT("plusmn"),177 },
787             { wxT("pound"),163 },
788             { wxT("prime"),8242 },
789             { wxT("prod"),8719 },
790             { wxT("prop"),8733 },
791             { wxT("psi"),968 },
792             { wxT("quot"),34 },
793             { wxT("rArr"),8658 },
794             { wxT("radic"),8730 },
795             { wxT("rang"),9002 },
796             { wxT("raquo"),187 },
797             { wxT("rarr"),8594 },
798             { wxT("rceil"),8969 },
799             { wxT("rdquo"),8221 },
800             { wxT("real"),8476 },
801             { wxT("reg"),174 },
802             { wxT("rfloor"),8971 },
803             { wxT("rho"),961 },
804             { wxT("rlm"),8207 },
805             { wxT("rsaquo"),8250 },
806             { wxT("rsquo"),8217 },
807             { wxT("sbquo"),8218 },
808             { wxT("scaron"),353 },
809             { wxT("sdot"),8901 },
810             { wxT("sect"),167 },
811             { wxT("shy"),173 },
812             { wxT("sigma"),963 },
813             { wxT("sigmaf"),962 },
814             { wxT("sim"),8764 },
815             { wxT("spades"),9824 },
816             { wxT("sub"),8834 },
817             { wxT("sube"),8838 },
818             { wxT("sum"),8721 },
819             { wxT("sup"),8835 },
820             { wxT("sup1"),185 },
821             { wxT("sup2"),178 },
822             { wxT("sup3"),179 },
823             { wxT("supe"),8839 },
824             { wxT("szlig"),223 },
825             { wxT("tau"),964 },
826             { wxT("there4"),8756 },
827             { wxT("theta"),952 },
828             { wxT("thetasym"),977 },
829             { wxT("thinsp"),8201 },
830             { wxT("thorn"),254 },
831             { wxT("tilde"),732 },
832             { wxT("times"),215 },
833             { wxT("trade"),8482 },
834             { wxT("uArr"),8657 },
835             { wxT("uacute"),250 },
836             { wxT("uarr"),8593 },
837             { wxT("ucirc"),251 },
838             { wxT("ugrave"),249 },
839             { wxT("uml"),168 },
840             { wxT("upsih"),978 },
841             { wxT("upsilon"),965 },
842             { wxT("uuml"),252 },
843             { wxT("weierp"),8472 },
844             { wxT("xi"),958 },
845             { wxT("yacute"),253 },
846             { wxT("yen"),165 },
847             { wxT("yuml"),255 },
848             { wxT("zeta"),950 },
849             { wxT("zwj"),8205 },
850             { wxT("zwnj"),8204 },
851             {NULL, 0}};
852         static size_t substitutions_cnt = 0;
853 
854         if (substitutions_cnt == 0)
855             while (substitutions[substitutions_cnt].code != 0)
856                 substitutions_cnt++;
857 
858         wxHtmlEntityInfo *info = NULL;
859 #ifdef __WXWINCE__
860         // bsearch crashes under WinCE for some reason
861         size_t i;
862         for (i = 0; i < substitutions_cnt; i++)
863         {
864             if (entity == substitutions[i].name)
865             {
866                 info = & substitutions[i];
867                 break;
868             }
869         }
870 #else
871         info = (wxHtmlEntityInfo*) bsearch(entity.c_str(), substitutions,
872                                            substitutions_cnt,
873                                            sizeof(wxHtmlEntityInfo),
874                                            wxHtmlEntityCompare);
875 #endif
876         if (info)
877             code = info->code;
878     }
879 
880     if (code == 0)
881         return 0;
882     else
883         return GetCharForCode(code);
884 }
885 
OpenURL(wxHtmlURLType WXUNUSED (type),const wxString & url) const886 wxFSFile *wxHtmlParser::OpenURL(wxHtmlURLType WXUNUSED(type),
887                                 const wxString& url) const
888 {
889     return m_FS ? m_FS->OpenFile(url) : NULL;
890 
891 }
892 
893 
894 //-----------------------------------------------------------------------------
895 // wxHtmlParser::ExtractCharsetInformation
896 //-----------------------------------------------------------------------------
897 
898 class wxMetaTagParser : public wxHtmlParser
899 {
900 public:
wxMetaTagParser()901     wxMetaTagParser() { }
902 
GetProduct()903     wxObject* GetProduct() { return NULL; }
904 
905 protected:
AddText(const wxChar * WXUNUSED (txt))906     virtual void AddText(const wxChar* WXUNUSED(txt)) {}
907 
908     DECLARE_NO_COPY_CLASS(wxMetaTagParser)
909 };
910 
911 class wxMetaTagHandler : public wxHtmlTagHandler
912 {
913 public:
wxMetaTagHandler(wxString * retval)914     wxMetaTagHandler(wxString *retval) : wxHtmlTagHandler(), m_retval(retval) {}
GetSupportedTags()915     wxString GetSupportedTags() { return wxT("META,BODY"); }
916     bool HandleTag(const wxHtmlTag& tag);
917 
918 private:
919     wxString *m_retval;
920 
921     DECLARE_NO_COPY_CLASS(wxMetaTagHandler)
922 };
923 
HandleTag(const wxHtmlTag & tag)924 bool wxMetaTagHandler::HandleTag(const wxHtmlTag& tag)
925 {
926     if (tag.GetName() == _T("BODY"))
927     {
928         m_Parser->StopParsing();
929         return false;
930     }
931 
932     if (tag.HasParam(_T("HTTP-EQUIV")) &&
933         tag.GetParam(_T("HTTP-EQUIV")).IsSameAs(_T("Content-Type"), false) &&
934         tag.HasParam(_T("CONTENT")))
935     {
936         wxString content = tag.GetParam(_T("CONTENT")).Lower();
937         if (content.Left(19) == _T("text/html; charset="))
938         {
939             *m_retval = content.Mid(19);
940             m_Parser->StopParsing();
941         }
942     }
943     return false;
944 }
945 
946 
947 /*static*/
ExtractCharsetInformation(const wxString & markup)948 wxString wxHtmlParser::ExtractCharsetInformation(const wxString& markup)
949 {
950     wxString charset;
951     wxMetaTagParser *parser = new wxMetaTagParser();
952     if(parser)
953     {
954         parser->AddTagHandler(new wxMetaTagHandler(&charset));
955         parser->Parse(markup);
956         delete parser;
957     }
958     return charset;
959 }
960 
961 #endif
962