1 /* tidylib.c -- internal library definitions
2 
3   (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
4   See tidyp.h for the copyright notice.
5 
6   Defines HTML Tidy API implemented by tidy library.
7 
8   Very rough initial cut for discussion purposes.
9 
10   Public interface is const-correct and doesn't explicitly depend
11   on any globals.  Thus, thread-safety may be introduced w/out
12   changing the interface.
13 
14   Looking ahead to a C++ wrapper, C functions always pass
15   this-equivalent as 1st arg.
16 
17   Created 2001-05-20 by Charles Reitzel
18 
19 */
20 
21 #include <errno.h>
22 
23 #include "tidy-int.h"
24 #include "parser.h"
25 #include "clean.h"
26 #include "config.h"
27 #include "message.h"
28 #include "pprint.h"
29 #include "entities.h"
30 #include "tmbstr.h"
31 #include "utf8.h"
32 #include "mappedio.h"
33 
34 #ifdef TIDY_WIN32_MLANG_SUPPORT
35 #include "win32tc.h"
36 #endif
37 
38 /* Create/Destroy a Tidy "document" object */
39 static TidyDocImpl* tidyDocCreate( TidyAllocator *allocator );
40 static void         tidyDocRelease( TidyDocImpl* impl );
41 
42 static int          tidyDocStatus( TidyDocImpl* impl );
43 
44 /* Parse Markup */
45 static int          tidyDocParseFile( TidyDocImpl* impl, ctmbstr htmlfil );
46 static int          tidyDocParseStdin( TidyDocImpl* impl );
47 static int          tidyDocParseString( TidyDocImpl* impl, ctmbstr content );
48 static int          tidyDocParseBuffer( TidyDocImpl* impl, TidyBuffer* inbuf );
49 static int          tidyDocParseSource( TidyDocImpl* impl, TidyInputSource* docIn );
50 
51 
52 /* Execute post-parse diagnostics and cleanup.
53 ** Note, the order is important.  You will get different
54 ** results from the diagnostics depending on if they are run
55 ** pre-or-post repair.
56 */
57 static int          tidyDocRunDiagnostics( TidyDocImpl* doc );
58 static int          tidyDocCleanAndRepair( TidyDocImpl* doc );
59 
60 
61 /* Save cleaned up file to file/buffer/sink */
62 static int          tidyDocSaveFile( TidyDocImpl* impl, ctmbstr htmlfil );
63 static int          tidyDocSaveStdout( TidyDocImpl* impl );
64 static int          tidyDocSaveString( TidyDocImpl* impl, tmbstr buffer, uint* buflen );
65 static int          tidyDocSaveBuffer( TidyDocImpl* impl, TidyBuffer* outbuf );
66 static int          tidyDocSaveSink( TidyDocImpl* impl, TidyOutputSink* docOut );
67 static int          tidyDocSaveStream( TidyDocImpl* impl, StreamOut* out );
68 
#ifdef NEVER
/* Function forms of the opaque-handle <-> implementation conversions.
** Only compiled when NEVER is defined.  Each one is a plain cast
** between the public handle type and the internal pointer type.
*/

/* Document handle -> implementation pointer */
TidyDocImpl* tidyDocToImpl( TidyDoc tdoc )
{
  TidyDocImpl* impl = (TidyDocImpl*) tdoc;
  return impl;
}

/* Implementation pointer -> document handle */
TidyDoc      tidyImplToDoc( TidyDocImpl* impl )
{
  TidyDoc tdoc = (TidyDoc) impl;
  return tdoc;
}

/* Node handle -> implementation pointer */
Node*        tidyNodeToImpl( TidyNode tnod )
{
  Node* node = (Node*) tnod;
  return node;
}

/* Implementation pointer -> node handle */
TidyNode     tidyImplToNode( Node* node )
{
  TidyNode tnod = (TidyNode) node;
  return tnod;
}

/* Attribute handle -> implementation pointer */
AttVal*      tidyAttrToImpl( TidyAttr tattr )
{
  AttVal* attval = (AttVal*) tattr;
  return attval;
}

/* Implementation pointer -> attribute handle */
TidyAttr     tidyImplToAttr( AttVal* attval )
{
  TidyAttr tattr = (TidyAttr) attval;
  return tattr;
}

/* Option handle -> implementation pointer */
const TidyOptionImpl* tidyOptionToImpl( TidyOption topt )
{
  const TidyOptionImpl* option = (const TidyOptionImpl*) topt;
  return option;
}

/* Implementation pointer -> option handle */
TidyOption   tidyImplToOption( const TidyOptionImpl* option )
{
  TidyOption topt = (TidyOption) option;
  return topt;
}
#endif
106 
107 /* Tidy public interface
108 **
109 ** Most functions return an integer:
110 **
111 ** 0    -> SUCCESS
112 ** >0   -> WARNING
113 ** <0   -> ERROR
114 **
115 */
116 
tidyCreate(void)117 TidyDoc TIDY_CALL       tidyCreate(void)
118 {
119   TidyDocImpl* impl = tidyDocCreate( &TY_(g_default_allocator) );
120   return tidyImplToDoc( impl );
121 }
122 
tidyCreateWithAllocator(TidyAllocator * allocator)123 TidyDoc TIDY_CALL tidyCreateWithAllocator( TidyAllocator *allocator )
124 {
125   TidyDocImpl* impl = tidyDocCreate( allocator );
126   return tidyImplToDoc( impl );
127 }
128 
tidyRelease(TidyDoc tdoc)129 void TIDY_CALL          tidyRelease( TidyDoc tdoc )
130 {
131   TidyDocImpl* impl = tidyDocToImpl( tdoc );
132   tidyDocRelease( impl );
133 }
134 
tidyDocCreate(TidyAllocator * allocator)135 TidyDocImpl* tidyDocCreate( TidyAllocator *allocator )
136 {
137     TidyDocImpl* doc = (TidyDocImpl*)TidyAlloc( allocator, sizeof(TidyDocImpl) );
138     TidyClearMemory( doc, sizeof(*doc) );
139     doc->allocator = allocator;
140 
141     TY_(InitMap)();
142     TY_(InitTags)( doc );
143     TY_(InitAttrs)( doc );
144     TY_(InitConfig)( doc );
145     TY_(InitPrintBuf)( doc );
146 
147     /* By default, wire tidy messages to standard error.
148     ** Document input will be set by parsing routines.
149     ** Document output will be set by pretty print routines.
150     ** Config input will be set by config parsing routines.
151     ** But we need to start off with a way to report errors.
152     */
153     doc->errout = TY_(StdErrOutput)();
154     return doc;
155 }
156 
/* Tear down a document and free the object itself.
** A null doc is accepted and ignored.
*/
void          tidyDocRelease( TidyDocImpl* doc )
{
    if ( !doc )
        return;

    /* doc in/out are opened and closed by the parse/print routines,
    ** so both must already be gone by the time we get here.
    */
    assert( doc->docIn == NULL );
    assert( doc->docOut == NULL );

    /* Diagnostic output stream */
    TY_(ReleaseStreamOut)( doc, doc->errout );
    doc->errout = NULL;

    /* Print buffer, lexer and the parsed tree */
    TY_(FreePrintBuf)( doc );
    TY_(FreeLexer)( doc );
    TY_(FreeNode)(doc, &doc->root);
    TidyClearMemory(&doc->root, sizeof(Node));

    if ( doc->givenDoctype != NULL )
        TidyDocFree(doc, doc->givenDoctype);

    /* Configuration and lookup tables, then the document itself */
    TY_(FreeConfig)( doc );
    TY_(FreeAttrTable)( doc );
    TY_(FreeTags)( doc );
    TidyDocFree( doc, doc );
}
182 
/* Let application store a chunk of data w/ each Tidy instance.
** Useful for callbacks.
*/
tidySetAppData(TidyDoc tdoc,void * appData)186 void TIDY_CALL        tidySetAppData( TidyDoc tdoc, void* appData )
187 {
188   TidyDocImpl* impl = tidyDocToImpl( tdoc );
189   if ( impl )
190     impl->appData = appData;
191 }
tidyGetAppData(TidyDoc tdoc)192 void* TIDY_CALL       tidyGetAppData( TidyDoc tdoc )
193 {
194   TidyDocImpl* impl = tidyDocToImpl( tdoc );
195   if ( impl )
196     return impl->appData;
197   return NULL;
198 }
199 
tidyVersion(void)200 ctmbstr TIDY_CALL     tidyVersion(void)
201 {
202     return TY_(Version)();
203 }
204 
205 
206 /* Get/set configuration options
207 */
tidySetOptionCallback(TidyDoc tdoc,TidyOptCallback pOptCallback)208 Bool TIDY_CALL        tidySetOptionCallback( TidyDoc tdoc, TidyOptCallback pOptCallback )
209 {
210   TidyDocImpl* impl = tidyDocToImpl( tdoc );
211   if ( impl )
212   {
213     impl->pOptCallback = pOptCallback;
214     return yes;
215   }
216   return no;
217 }
218 
219 
tidyLoadConfig(TidyDoc tdoc,ctmbstr cfgfil)220 int TIDY_CALL     tidyLoadConfig( TidyDoc tdoc, ctmbstr cfgfil )
221 {
222     TidyDocImpl* impl = tidyDocToImpl( tdoc );
223     if ( impl )
224         return TY_(ParseConfigFile)( impl, cfgfil );
225     return -EINVAL;
226 }
227 
tidyLoadConfigEnc(TidyDoc tdoc,ctmbstr cfgfil,ctmbstr charenc)228 int TIDY_CALL     tidyLoadConfigEnc( TidyDoc tdoc, ctmbstr cfgfil, ctmbstr charenc )
229 {
230     TidyDocImpl* impl = tidyDocToImpl( tdoc );
231     if ( impl )
232         return TY_(ParseConfigFileEnc)( impl, cfgfil, charenc );
233     return -EINVAL;
234 }
235 
tidySetCharEncoding(TidyDoc tdoc,ctmbstr encnam)236 int TIDY_CALL         tidySetCharEncoding( TidyDoc tdoc, ctmbstr encnam )
237 {
238     TidyDocImpl* impl = tidyDocToImpl( tdoc );
239     if ( impl )
240     {
241         int enc = TY_(CharEncodingId)( impl, encnam );
242         if ( enc >= 0 && TY_(AdjustCharEncoding)(impl, enc) )
243             return 0;
244 
245         TY_(ReportBadArgument)( impl, "char-encoding" );
246     }
247     return -EINVAL;
248 }
249 
tidySetInCharEncoding(TidyDoc tdoc,ctmbstr encnam)250 int TIDY_CALL           tidySetInCharEncoding( TidyDoc tdoc, ctmbstr encnam )
251 {
252     TidyDocImpl* impl = tidyDocToImpl( tdoc );
253     if ( impl )
254     {
255         int enc = TY_(CharEncodingId)( impl, encnam );
256         if ( enc >= 0 && TY_(SetOptionInt)( impl, TidyInCharEncoding, enc ) )
257             return 0;
258 
259         TY_(ReportBadArgument)( impl, "in-char-encoding" );
260     }
261     return -EINVAL;
262 }
263 
tidySetOutCharEncoding(TidyDoc tdoc,ctmbstr encnam)264 int TIDY_CALL           tidySetOutCharEncoding( TidyDoc tdoc, ctmbstr encnam )
265 {
266     TidyDocImpl* impl = tidyDocToImpl( tdoc );
267     if ( impl )
268     {
269         int enc = TY_(CharEncodingId)( impl, encnam );
270         if ( enc >= 0 && TY_(SetOptionInt)( impl, TidyOutCharEncoding, enc ) )
271             return 0;
272 
273         TY_(ReportBadArgument)( impl, "out-char-encoding" );
274     }
275     return -EINVAL;
276 }
277 
tidyOptGetIdForName(ctmbstr optnam)278 TidyOptionId TIDY_CALL tidyOptGetIdForName( ctmbstr optnam )
279 {
280     const TidyOptionImpl* option = TY_(lookupOption)( optnam );
281     if ( option )
282         return option->id;
283     return N_TIDY_OPTIONS;  /* Error */
284 }
285 
tidyGetOptionList(TidyDoc tdoc)286 TidyIterator TIDY_CALL  tidyGetOptionList( TidyDoc tdoc )
287 {
288     TidyDocImpl* impl = tidyDocToImpl( tdoc );
289     if ( impl )
290         return TY_(getOptionList)( impl );
291     return (TidyIterator) -1;
292 }
293 
tidyGetNextOption(TidyDoc tdoc,TidyIterator * pos)294 TidyOption TIDY_CALL    tidyGetNextOption( TidyDoc tdoc, TidyIterator* pos )
295 {
296     TidyDocImpl* impl = tidyDocToImpl( tdoc );
297     const TidyOptionImpl* option = NULL;
298     if ( impl )
299         option = TY_(getNextOption)( impl, pos );
300     else if ( pos )
301         *pos = 0;
302     return tidyImplToOption( option );
303 }
304 
305 
tidyGetOption(TidyDoc ARG_UNUSED (tdoc),TidyOptionId optId)306 TidyOption TIDY_CALL    tidyGetOption( TidyDoc ARG_UNUSED(tdoc), TidyOptionId optId )
307 {
308     const TidyOptionImpl* option = TY_(getOption)( optId );
309     return tidyImplToOption( option );
310 }
tidyGetOptionByName(TidyDoc ARG_UNUSED (doc),ctmbstr optnam)311 TidyOption TIDY_CALL    tidyGetOptionByName( TidyDoc ARG_UNUSED(doc), ctmbstr optnam )
312 {
313     const TidyOptionImpl* option = TY_(lookupOption)( optnam );
314     return tidyImplToOption( option );
315 }
316 
tidyOptGetId(TidyOption topt)317 TidyOptionId TIDY_CALL  tidyOptGetId( TidyOption topt )
318 {
319     const TidyOptionImpl* option = tidyOptionToImpl( topt );
320     if ( option )
321         return option->id;
322     return N_TIDY_OPTIONS;
323 }
tidyOptGetName(TidyOption topt)324 ctmbstr TIDY_CALL       tidyOptGetName( TidyOption topt )
325 {
326     const TidyOptionImpl* option = tidyOptionToImpl( topt );
327     if ( option )
328         return option->name;
329     return NULL;
330 }
tidyOptGetType(TidyOption topt)331 TidyOptionType TIDY_CALL tidyOptGetType( TidyOption topt )
332 {
333     const TidyOptionImpl* option = tidyOptionToImpl( topt );
334     if ( option )
335         return option->type;
336     return (TidyOptionType) -1;
337 }
tidyOptGetCategory(TidyOption topt)338 TidyConfigCategory TIDY_CALL tidyOptGetCategory( TidyOption topt )
339 {
340     const TidyOptionImpl* option = tidyOptionToImpl( topt );
341     if ( option )
342         return option->category;
343     return (TidyConfigCategory) -1;
344 }
tidyOptGetDefault(TidyOption topt)345 ctmbstr TIDY_CALL       tidyOptGetDefault( TidyOption topt )
346 {
347     const TidyOptionImpl* option = tidyOptionToImpl( topt );
348     if ( option && option->type == TidyString )
349         return (ctmbstr) option->dflt;
350     return NULL;
351 }
tidyOptGetDefaultInt(TidyOption topt)352 ulong TIDY_CALL          tidyOptGetDefaultInt( TidyOption topt )
353 {
354     const TidyOptionImpl* option = tidyOptionToImpl( topt );
355     if ( option && option->type != TidyString )
356         return option->dflt;
357     return ~0U;
358 }
tidyOptGetDefaultBool(TidyOption topt)359 Bool TIDY_CALL          tidyOptGetDefaultBool( TidyOption topt )
360 {
361     const TidyOptionImpl* option = tidyOptionToImpl( topt );
362     if ( option && option->type != TidyString )
363         return ( option->dflt ? yes : no );
364     return no;
365 }
tidyOptIsReadOnly(TidyOption topt)366 Bool TIDY_CALL          tidyOptIsReadOnly( TidyOption topt )
367 {
368     const TidyOptionImpl* option = tidyOptionToImpl( topt );
369     if ( option  )
370         return ( option->parser == NULL );
371     return yes;
372 }
373 
374 
tidyOptGetPickList(TidyOption topt)375 TidyIterator TIDY_CALL  tidyOptGetPickList( TidyOption topt )
376 {
377     const TidyOptionImpl* option = tidyOptionToImpl( topt );
378     if ( option )
379       return TY_(getOptionPickList)( option );
380     return (TidyIterator) -1;
381 }
tidyOptGetNextPick(TidyOption topt,TidyIterator * pos)382 ctmbstr TIDY_CALL       tidyOptGetNextPick( TidyOption topt, TidyIterator* pos )
383 {
384     const TidyOptionImpl* option = tidyOptionToImpl( topt );
385     if ( option )
386         return TY_(getNextOptionPick)( option, pos );
387     return NULL;
388 }
389 
390 
tidyOptGetValue(TidyDoc tdoc,TidyOptionId optId)391 ctmbstr TIDY_CALL       tidyOptGetValue( TidyDoc tdoc, TidyOptionId optId )
392 {
393   TidyDocImpl* impl = tidyDocToImpl( tdoc );
394   ctmbstr optval = NULL;
395   if ( impl )
396     optval = cfgStr( impl, optId );
397   return optval;
398 }
tidyOptSetValue(TidyDoc tdoc,TidyOptionId optId,ctmbstr val)399 Bool TIDY_CALL        tidyOptSetValue( TidyDoc tdoc, TidyOptionId optId, ctmbstr val )
400 {
401   TidyDocImpl* impl = tidyDocToImpl( tdoc );
402   if ( impl )
403     return TY_(ParseConfigValue)( impl, optId, val );
404   return no;
405 }
tidyOptParseValue(TidyDoc tdoc,ctmbstr optnam,ctmbstr val)406 Bool TIDY_CALL        tidyOptParseValue( TidyDoc tdoc, ctmbstr optnam, ctmbstr val )
407 {
408   TidyDocImpl* impl = tidyDocToImpl( tdoc );
409   if ( impl )
410     return TY_(ParseConfigOption)( impl, optnam, val );
411   return no;
412 }
413 
tidyOptGetInt(TidyDoc tdoc,TidyOptionId optId)414 ulong TIDY_CALL        tidyOptGetInt( TidyDoc tdoc, TidyOptionId optId )
415 {
416     TidyDocImpl* impl = tidyDocToImpl( tdoc );
417     ulong opti = 0;
418     if ( impl )
419         opti = cfg( impl, optId );
420     return opti;
421 }
422 
tidyOptSetInt(TidyDoc tdoc,TidyOptionId optId,ulong val)423 Bool TIDY_CALL        tidyOptSetInt( TidyDoc tdoc, TidyOptionId optId, ulong val )
424 {
425     TidyDocImpl* impl = tidyDocToImpl( tdoc );
426     if ( impl )
427         return TY_(SetOptionInt)( impl, optId, val );
428     return no;
429 }
430 
tidyOptGetBool(TidyDoc tdoc,TidyOptionId optId)431 Bool TIDY_CALL         tidyOptGetBool( TidyDoc tdoc, TidyOptionId optId )
432 {
433     TidyDocImpl* impl = tidyDocToImpl( tdoc );
434     Bool optb = no;
435     if ( impl )
436     {
437         const TidyOptionImpl* option = TY_(getOption)( optId );
438         if ( option )
439         {
440             optb = cfgBool( impl, optId );
441         }
442     }
443     return optb;
444 }
445 
tidyOptSetBool(TidyDoc tdoc,TidyOptionId optId,Bool val)446 Bool TIDY_CALL        tidyOptSetBool( TidyDoc tdoc, TidyOptionId optId, Bool val )
447 {
448     TidyDocImpl* impl = tidyDocToImpl( tdoc );
449     if ( impl )
450         return TY_(SetOptionBool)( impl, optId, val );
451     return no;
452 }
453 
tidyOptGetEncName(TidyDoc tdoc,TidyOptionId optId)454 ctmbstr TIDY_CALL       tidyOptGetEncName( TidyDoc tdoc, TidyOptionId optId )
455 {
456   uint enc = tidyOptGetInt( tdoc, optId );
457   return TY_(CharEncodingOptName)( enc );
458 }
459 
tidyOptGetCurrPick(TidyDoc tdoc,TidyOptionId optId)460 ctmbstr TIDY_CALL       tidyOptGetCurrPick( TidyDoc tdoc, TidyOptionId optId )
461 {
462     const TidyOptionImpl* option = TY_(getOption)( optId );
463     if ( option && option->pickList )
464     {
465         uint ix, pick = tidyOptGetInt( tdoc, optId );
466         const ctmbstr* pL = option->pickList;
467         for ( ix=0; *pL && ix < pick; ++ix )
468             ++pL;
469         if ( *pL )
470             return *pL;
471     }
472     return NULL;
473 }
474 
475 
tidyOptGetDeclTagList(TidyDoc tdoc)476 TidyIterator TIDY_CALL tidyOptGetDeclTagList( TidyDoc tdoc )
477 {
478     TidyDocImpl* impl = tidyDocToImpl( tdoc );
479     TidyIterator declIter = 0;
480     if ( impl )
481         declIter = TY_(GetDeclaredTagList)( impl );
482     return declIter;
483 }
484 
tidyOptGetNextDeclTag(TidyDoc tdoc,TidyOptionId optId,TidyIterator * iter)485 ctmbstr TIDY_CALL       tidyOptGetNextDeclTag( TidyDoc tdoc, TidyOptionId optId,
486                                      TidyIterator* iter )
487 {
488     TidyDocImpl* impl = tidyDocToImpl( tdoc );
489     ctmbstr tagnam = NULL;
490     if ( impl )
491     {
492         UserTagType tagtyp = tagtype_null;
493         if ( optId == TidyInlineTags )
494             tagtyp = tagtype_inline;
495         else if ( optId == TidyBlockTags )
496             tagtyp = tagtype_block;
497         else if ( optId == TidyEmptyTags )
498             tagtyp = tagtype_empty;
499         else if ( optId == TidyPreTags )
500             tagtyp = tagtype_pre;
501         if ( tagtyp != tagtype_null )
502             tagnam = TY_(GetNextDeclaredTag)( impl, tagtyp, iter );
503     }
504     return tagnam;
505 }
506 
tidyOptGetDoc(TidyDoc ARG_UNUSED (tdoc),TidyOption opt)507 ctmbstr TIDY_CALL tidyOptGetDoc( TidyDoc ARG_UNUSED(tdoc), TidyOption opt )
508 {
509     const TidyOptionId optId = tidyOptGetId( opt );
510     const TidyOptionDoc* docDesc = TY_(OptGetDocDesc)( optId );
511     return docDesc ? docDesc->doc : NULL;
512 }
513 
tidyOptGetDocLinksList(TidyDoc ARG_UNUSED (tdoc),TidyOption opt)514 TidyIterator TIDY_CALL tidyOptGetDocLinksList( TidyDoc ARG_UNUSED(tdoc), TidyOption opt )
515 {
516     const TidyOptionId optId = tidyOptGetId( opt );
517     const TidyOptionDoc* docDesc = TY_(OptGetDocDesc)( optId );
518     if (docDesc && docDesc->links)
519         return (TidyIterator)docDesc->links;
520     return (TidyIterator)NULL;
521 }
522 
tidyOptGetNextDocLinks(TidyDoc tdoc,TidyIterator * pos)523 TidyOption TIDY_CALL tidyOptGetNextDocLinks( TidyDoc tdoc, TidyIterator* pos )
524 {
525     const TidyOptionId* curr = (const TidyOptionId *)*pos;
526     TidyOption opt;
527 
528     if (*curr == TidyUnknownOption)
529     {
530         *pos = (TidyIterator)NULL;
531         return (TidyOption)0;
532     }
533     opt = tidyGetOption(tdoc, *curr);
534     curr++;
535     *pos = (*curr == TidyUnknownOption ) ?
536         (TidyIterator)NULL:(TidyIterator)curr;
537     return opt;
538 }
539 
tidyOptSaveFile(TidyDoc tdoc,ctmbstr cfgfil)540 int TIDY_CALL tidyOptSaveFile( TidyDoc tdoc, ctmbstr cfgfil )
541 {
542     TidyDocImpl* impl = tidyDocToImpl( tdoc );
543     if ( impl )
544         return TY_(SaveConfigFile)( impl, cfgfil );
545     return -EINVAL;
546 }
547 
tidyOptSaveSink(TidyDoc tdoc,TidyOutputSink * sink)548 int TIDY_CALL tidyOptSaveSink( TidyDoc tdoc, TidyOutputSink* sink )
549 {
550     TidyDocImpl* impl = tidyDocToImpl( tdoc );
551     if ( impl )
552         return TY_(SaveConfigSink)( impl, sink );
553     return -EINVAL;
554 }
555 
tidyOptSnapshot(TidyDoc tdoc)556 Bool TIDY_CALL tidyOptSnapshot( TidyDoc tdoc )
557 {
558     TidyDocImpl* impl = tidyDocToImpl( tdoc );
559     if ( impl )
560     {
561         TY_(TakeConfigSnapshot)( impl );
562         return yes;
563     }
564     return no;
565 }
tidyOptResetToSnapshot(TidyDoc tdoc)566 Bool TIDY_CALL tidyOptResetToSnapshot( TidyDoc tdoc )
567 {
568     TidyDocImpl* impl = tidyDocToImpl( tdoc );
569     if ( impl )
570     {
571         TY_(ResetConfigToSnapshot)( impl );
572         return yes;
573     }
574     return no;
575 }
tidyOptResetAllToDefault(TidyDoc tdoc)576 Bool TIDY_CALL tidyOptResetAllToDefault( TidyDoc tdoc )
577 {
578     TidyDocImpl* impl = tidyDocToImpl( tdoc );
579     if ( impl )
580     {
581         TY_(ResetConfigToDefault)( impl );
582         return yes;
583     }
584     return no;
585 }
586 
tidyOptResetToDefault(TidyDoc tdoc,TidyOptionId optId)587 Bool TIDY_CALL tidyOptResetToDefault( TidyDoc tdoc, TidyOptionId optId )
588 {
589     TidyDocImpl* impl = tidyDocToImpl( tdoc );
590     if ( impl )
591         return TY_(ResetOptionToDefault)( impl, optId );
592     return no;
593 }
594 
tidyOptDiffThanDefault(TidyDoc tdoc)595 Bool TIDY_CALL tidyOptDiffThanDefault( TidyDoc tdoc )
596 {
597     TidyDocImpl* impl = tidyDocToImpl( tdoc );
598     if ( impl )
599         return TY_(ConfigDiffThanDefault)( impl );
600     return no;
601 }
tidyOptDiffThanSnapshot(TidyDoc tdoc)602 Bool TIDY_CALL          tidyOptDiffThanSnapshot( TidyDoc tdoc )
603 {
604     TidyDocImpl* impl = tidyDocToImpl( tdoc );
605     if ( impl )
606         return TY_(ConfigDiffThanSnapshot)( impl );
607     return no;
608 }
609 
tidyOptCopyConfig(TidyDoc to,TidyDoc from)610 Bool TIDY_CALL tidyOptCopyConfig( TidyDoc to, TidyDoc from )
611 {
612     TidyDocImpl* docTo = tidyDocToImpl( to );
613     TidyDocImpl* docFrom = tidyDocToImpl( from );
614     if ( docTo && docFrom )
615     {
616         TY_(CopyConfig)( docTo, docFrom );
617         return yes;
618     }
619     return no;
620 }
621 
622 
/* I/O and Message handling interface
**
** By default, Tidy will define, create and use
** instances of input and output handlers for
** standard C buffered I/O (i.e. FILE* stdin,
** FILE* stdout and FILE* stderr for content
** input, content output and diagnostic output,
** respectively).  A FILE* cfgFile input handler
** will be used for config files.  Command line
** options will just be set directly.
*/
634 
635 /* Use TidyReportFilter to filter messages by diagnostic level:
636 ** info, warning, etc.  Just set diagnostic output
637 ** handler to redirect all diagnostics output.  Return true
638 ** to proceed with output, false to cancel.
639 */
tidySetReportFilter(TidyDoc tdoc,TidyReportFilter filt)640 Bool TIDY_CALL        tidySetReportFilter( TidyDoc tdoc, TidyReportFilter filt )
641 {
642     TidyDocImpl* impl = tidyDocToImpl( tdoc );
643     if ( impl )
644     {
645         impl->mssgFilt = filt;
646         return yes;
647     }
648     return no;
649 }
650 
tidySetErrorFile(TidyDoc tdoc,ctmbstr errfilnam)651 FILE* TIDY_CALL   tidySetErrorFile( TidyDoc tdoc, ctmbstr errfilnam )
652 {
653     TidyDocImpl* impl = tidyDocToImpl( tdoc );
654     if ( impl )
655     {
656         FILE* errout = fopen( errfilnam, "wb" );
657         if ( errout )
658         {
659             uint outenc = cfg( impl, TidyOutCharEncoding );
660             uint nl = cfg( impl, TidyNewline );
661             TY_(ReleaseStreamOut)( impl, impl->errout );
662             impl->errout = TY_(FileOutput)( impl, errout, outenc, nl );
663             return errout;
664         }
665         else /* Emit message to current error sink */
666             TY_(FileError)( impl, errfilnam, TidyError );
667     }
668     return NULL;
669 }
670 
tidySetErrorBuffer(TidyDoc tdoc,TidyBuffer * errbuf)671 int TIDY_CALL    tidySetErrorBuffer( TidyDoc tdoc, TidyBuffer* errbuf )
672 {
673     TidyDocImpl* impl = tidyDocToImpl( tdoc );
674     if ( impl )
675     {
676         uint outenc = cfg( impl, TidyOutCharEncoding );
677         uint nl = cfg( impl, TidyNewline );
678         TY_(ReleaseStreamOut)( impl, impl->errout );
679         impl->errout = TY_(BufferOutput)( impl, errbuf, outenc, nl );
680         return ( impl->errout ? 0 : -ENOMEM );
681     }
682     return -EINVAL;
683 }
684 
tidySetErrorSink(TidyDoc tdoc,TidyOutputSink * sink)685 int TIDY_CALL    tidySetErrorSink( TidyDoc tdoc, TidyOutputSink* sink )
686 {
687     TidyDocImpl* impl = tidyDocToImpl( tdoc );
688     if ( impl )
689     {
690         uint outenc = cfg( impl, TidyOutCharEncoding );
691         uint nl = cfg( impl, TidyNewline );
692         TY_(ReleaseStreamOut)( impl, impl->errout );
693         impl->errout = TY_(UserOutput)( impl, sink, outenc, nl );
694         return ( impl->errout ? 0 : -ENOMEM );
695     }
696     return -EINVAL;
697 }
698 
699 
700 /* Document info */
tidyStatus(TidyDoc tdoc)701 int TIDY_CALL        tidyStatus( TidyDoc tdoc )
702 {
703     TidyDocImpl* impl = tidyDocToImpl( tdoc );
704     int tidyStat = -EINVAL;
705     if ( impl )
706         tidyStat = tidyDocStatus( impl );
707     return tidyStat;
708 }
tidyDetectedHtmlVersion(TidyDoc ARG_UNUSED (tdoc))709 int TIDY_CALL        tidyDetectedHtmlVersion( TidyDoc ARG_UNUSED(tdoc) )
710 {
711 /*    TidyDocImpl* impl = tidyDocToImpl( tdoc ); */
712     return 0;
713 }
tidyDetectedXhtml(TidyDoc ARG_UNUSED (tdoc))714 Bool TIDY_CALL        tidyDetectedXhtml( TidyDoc ARG_UNUSED(tdoc) )
715 {
716 /*    TidyDocImpl* impl = tidyDocToImpl( tdoc ); */
717     return no;
718 }
tidyDetectedGenericXml(TidyDoc ARG_UNUSED (tdoc))719 Bool TIDY_CALL        tidyDetectedGenericXml( TidyDoc ARG_UNUSED(tdoc) )
720 {
721 /*    TidyDocImpl* impl = tidyDocToImpl( tdoc ); */
722     return no;
723 }
724 
tidyErrorCount(TidyDoc tdoc)725 uint TIDY_CALL       tidyErrorCount( TidyDoc tdoc )
726 {
727     TidyDocImpl* impl = tidyDocToImpl( tdoc );
728     uint count = 0xFFFFFFFF;
729     if ( impl )
730         count = impl->errors;
731     return count;
732 }
tidyWarningCount(TidyDoc tdoc)733 uint TIDY_CALL       tidyWarningCount( TidyDoc tdoc )
734 {
735     TidyDocImpl* impl = tidyDocToImpl( tdoc );
736     uint count = 0xFFFFFFFF;
737     if ( impl )
738         count = impl->warnings;
739     return count;
740 }
tidyAccessWarningCount(TidyDoc tdoc)741 uint TIDY_CALL       tidyAccessWarningCount( TidyDoc tdoc )
742 {
743     TidyDocImpl* impl = tidyDocToImpl( tdoc );
744     uint count = 0xFFFFFFFF;
745     if ( impl )
746         count = impl->accessErrors;
747     return count;
748 }
tidyConfigErrorCount(TidyDoc tdoc)749 uint TIDY_CALL       tidyConfigErrorCount( TidyDoc tdoc )
750 {
751     TidyDocImpl* impl = tidyDocToImpl( tdoc );
752     uint count = 0xFFFFFFFF;
753     if ( impl )
754         count = impl->optionErrors;
755     return count;
756 }
757 
758 
759 /* Error reporting functions
760 */
tidyErrorSummary(TidyDoc tdoc)761 void TIDY_CALL         tidyErrorSummary( TidyDoc tdoc )
762 {
763     TidyDocImpl* impl = tidyDocToImpl( tdoc );
764     if ( impl )
765         TY_(ErrorSummary)( impl );
766 }
tidyGeneralInfo(TidyDoc tdoc)767 void TIDY_CALL         tidyGeneralInfo( TidyDoc tdoc )
768 {
769     TidyDocImpl* impl = tidyDocToImpl( tdoc );
770     if ( impl )
771         TY_(GeneralInfo)( impl );
772 }
773 
774 
775 /* I/O Functions
776 **
777 ** Initial version supports only whole-file operations.
778 ** Do not expose Tidy StreamIn or Out data structures - yet.
779 */
780 
781 /* Parse/load Functions
782 **
783 ** HTML/XHTML version determined from input.
784 */
tidyParseFile(TidyDoc tdoc,ctmbstr filnam)785 int TIDY_CALL  tidyParseFile( TidyDoc tdoc, ctmbstr filnam )
786 {
787     TidyDocImpl* doc = tidyDocToImpl( tdoc );
788     return tidyDocParseFile( doc, filnam );
789 }
tidyParseStdin(TidyDoc tdoc)790 int TIDY_CALL  tidyParseStdin( TidyDoc tdoc )
791 {
792     TidyDocImpl* doc = tidyDocToImpl( tdoc );
793     return tidyDocParseStdin( doc );
794 }
tidyParseString(TidyDoc tdoc,ctmbstr content)795 int TIDY_CALL  tidyParseString( TidyDoc tdoc, ctmbstr content )
796 {
797     TidyDocImpl* doc = tidyDocToImpl( tdoc );
798     return tidyDocParseString( doc, content );
799 }
tidyParseBuffer(TidyDoc tdoc,TidyBuffer * inbuf)800 int TIDY_CALL  tidyParseBuffer( TidyDoc tdoc, TidyBuffer* inbuf )
801 {
802     TidyDocImpl* doc = tidyDocToImpl( tdoc );
803     return tidyDocParseBuffer( doc, inbuf );
804 }
tidyParseSource(TidyDoc tdoc,TidyInputSource * source)805 int TIDY_CALL  tidyParseSource( TidyDoc tdoc, TidyInputSource* source )
806 {
807     TidyDocImpl* doc = tidyDocToImpl( tdoc );
808     return tidyDocParseSource( doc, source );
809 }
810 
811 
/* Parse markup from the file filnam.
**
** On _WIN32 the work is delegated to DocParseFileWithMappedFile.
** Elsewhere the file is opened with mode "rb" and parsed through a
** file input stream using the TidyInCharEncoding setting.
**
** When PRESERVE_FILE_TIMES is enabled, the stored file times are
** cleared, then refilled from fstat() if the file opened, the
** TidyKeepFileTimes option is on and fstat() succeeded.
**
** Returns the parse status.  Returns -ENOENT if the file cannot be
** opened (a file error is reported) or if no input stream could be
** created (the file is closed again, nothing is reported).
*/
int   tidyDocParseFile( TidyDocImpl* doc, ctmbstr filnam )
{
#ifdef _WIN32
    return TY_(DocParseFileWithMappedFile)( doc, filnam );
#else
    int result = -ENOENT;
    StreamIn* stream;
    FILE* fp = fopen( filnam, "rb" );

#if PRESERVE_FILE_TIMES
    struct stat st;

    memset( &st, 0, sizeof(st) );
    /* get last modified time */
    TidyClearMemory( &doc->filetimes, sizeof(doc->filetimes) );
    if ( fp && cfgBool(doc,TidyKeepFileTimes) &&
         fstat(fileno(fp), &st) != -1 )
    {
          doc->filetimes.actime  = st.st_atime;
          doc->filetimes.modtime = st.st_mtime;
    }
#endif

    if ( !fp )
    {
        /* Error message! */
        TY_(FileError)( doc, filnam, TidyError );
        return result;
    }

    stream = TY_(FileInput)( doc, fp, cfg( doc, TidyInCharEncoding ));
    if ( !stream )
    {
        fclose( fp );
        return result;
    }

    result = TY_(DocParseStream)( doc, stream );
    TY_(freeFileSource)(&stream->source, yes);
    TY_(freeStreamIn)(stream);
    return result;
#endif
}
851 
/* Parse markup from standard input using the TidyInCharEncoding
** setting.
**
** Returns the parse status, or -ENOMEM if no input stream could be
** created.  tidyDocParseFile() checks the FileInput result for NULL
** in the same way; without the check a null stream would be handed
** to the parser and to freeStreamIn.
** NOTE(review): -ENOMEM mirrors tidySetErrorBuffer/Sink for a failed
** stream creation - confirm it is the preferred code here.
*/
int   tidyDocParseStdin( TidyDocImpl* doc )
{
    StreamIn* in = TY_(FileInput)( doc, stdin, cfg( doc, TidyInCharEncoding ));
    int status;

    if ( !in )
        return -ENOMEM;

    status = TY_(DocParseStream)( doc, in );
    TY_(freeStreamIn)(in);
    return status;
}
859 
/* Parse markup held in the buffer inbuf using the TidyInCharEncoding
** setting.  Returns the parse status, or -EINVAL for a null buffer.
*/
int   tidyDocParseBuffer( TidyDocImpl* doc, TidyBuffer* inbuf )
{
    StreamIn* stream;
    int result;

    if ( !inbuf )
        return -EINVAL;

    stream = TY_(BufferInput)( doc, inbuf, cfg( doc, TidyInCharEncoding ));
    result = TY_(DocParseStream)( doc, stream );
    TY_(freeStreamIn)(stream);
    return result;
}
871 
/* Parse markup held in the string content.
**
** The string is attached to a temporary buffer, with a size of its
** length plus one, and detached again after parsing.
**
** Returns the parse status, or -EINVAL for a null string.
*/
int   tidyDocParseString( TidyDocImpl* doc, ctmbstr content )
{
    TidyBuffer wrapper;
    StreamIn* stream = NULL;
    int result;

    if ( !content )
        return -EINVAL;

    tidyBufInitWithAllocator( &wrapper, doc->allocator );
    tidyBufAttach( &wrapper, (byte*)content, TY_(tmbstrlen)(content)+1 );
    stream = TY_(BufferInput)( doc, &wrapper, cfg( doc, TidyInCharEncoding ));
    result = TY_(DocParseStream)( doc, stream );
    tidyBufDetach( &wrapper );
    TY_(freeStreamIn)(stream);
    return result;
}
889 
/* Parse markup from a user-supplied input source using the
** TidyInCharEncoding setting.
**
** Returns the parse status, or -EINVAL for a null source.  The
** buffer and string parsers reject a null input the same way.
*/
int   tidyDocParseSource( TidyDocImpl* doc, TidyInputSource* source )
{
    StreamIn* in;
    int status;

    if ( !source )
        return -EINVAL;

    in = TY_(UserInput)( doc, source, cfg( doc, TidyInCharEncoding ));
    status = TY_(DocParseStream)( doc, in );
    TY_(freeStreamIn)(in);
    return status;
}
897 
898 
899 /* Print/save Functions
900 **
901 */
tidySaveFile(TidyDoc tdoc,ctmbstr filnam)902 int TIDY_CALL        tidySaveFile( TidyDoc tdoc, ctmbstr filnam )
903 {
904     TidyDocImpl* doc = tidyDocToImpl( tdoc );
905     return tidyDocSaveFile( doc, filnam );
906 }
tidySaveStdout(TidyDoc tdoc)907 int TIDY_CALL        tidySaveStdout( TidyDoc tdoc )
908 {
909     TidyDocImpl* doc = tidyDocToImpl( tdoc );
910     return tidyDocSaveStdout( doc );
911 }
tidySaveString(TidyDoc tdoc,tmbstr buffer,uint * buflen)912 int TIDY_CALL        tidySaveString( TidyDoc tdoc, tmbstr buffer, uint* buflen )
913 {
914     TidyDocImpl* doc = tidyDocToImpl( tdoc );
915     return tidyDocSaveString( doc, buffer, buflen );
916 }
tidySaveBuffer(TidyDoc tdoc,TidyBuffer * outbuf)917 int TIDY_CALL        tidySaveBuffer( TidyDoc tdoc, TidyBuffer* outbuf )
918 {
919     TidyDocImpl* doc = tidyDocToImpl( tdoc );
920     return tidyDocSaveBuffer( doc, outbuf );
921 }
tidySaveSink(TidyDoc tdoc,TidyOutputSink * sink)922 int TIDY_CALL        tidySaveSink( TidyDoc tdoc, TidyOutputSink* sink )
923 {
924     TidyDocImpl* doc = tidyDocToImpl( tdoc );
925     return tidyDocSaveSink( doc, sink );
926 }
927 
/* Write the document to the file named filnam.
**
** When the document has errors, write-back is enabled and output is
** not forced, the file is left untouched and the document status is
** returned.  Otherwise the file is opened in binary write mode; if it
** cannot be opened the result stays -ENOENT.  Any negative result is
** reported through TY_(FileError)().
*/
int         tidyDocSaveFile( TidyDocImpl* doc, ctmbstr filnam )
{
    int result = -ENOENT;
    FILE* fp = NULL;

    /* Never overwrite the input file when there is nothing to write */
    if ( doc->errors > 0 &&
         cfgBool(doc, TidyWriteBack) && !cfgBool(doc, TidyForceOutput) )
    {
        result = tidyDocStatus( doc );
    }
    else
    {
        fp = fopen( filnam, "wb" );
    }

    if ( fp != NULL )
    {
        uint encoding = cfg( doc, TidyOutCharEncoding );
        uint newline = cfg( doc, TidyNewline );
        StreamOut* sink = TY_(FileOutput)( doc, fp, encoding, newline );

        result = tidyDocSaveStream( doc, sink );

        fclose( fp );
        TidyDocFree( doc, sink );

#if PRESERVE_FILE_TIMES
        if ( doc->filetimes.actime )
        {
            /* put back the access/modification times recorded earlier,
            ** then forget them */
            utime( filnam, &doc->filetimes );
            TidyClearMemory( &doc->filetimes, sizeof(doc->filetimes) );
        }
#endif /* PRESERVE_FILE_TIMES */
    }

    if ( result < 0 ) /* Error message! */
        TY_(FileError)( doc, filnam, TidyError );

    return result;
}
964 
965 
966 
967 /* Note, _setmode() does NOT work on Win2K Pro w/ VC++ 6.0 SP3.
968 ** The code has been left in in case it works w/ other compilers
969 ** or operating systems.  If stdout is in Text mode, be aware that
970 ** it will garble UTF16 documents.  In text mode, when it encounters
971 ** a single byte of value 10 (0xA), it will insert a single byte
972 ** value 13 (0xD) just before it.  This has the effect of garbling
973 ** the entire document.
974 */
975 
976 #if !defined(NO_SETMODE_SUPPORT)
977 
978 #if defined(_WIN32) || defined(OS2_OS)
979 #include <fcntl.h>
980 #include <io.h>
981 #endif
982 
983 #endif
984 
/* Write the document to stdout.
**
** Where setmode() support is compiled in (_WIN32 or OS2_OS without
** NO_SETMODE_SUPPORT), stdout and stderr are switched to binary mode
** for the duration of the write and restored afterwards.  Both
** streams are flushed before the modes are restored.
** Returns the result of tidyDocSaveStream().
*/
int         tidyDocSaveStdout( TidyDocImpl* doc )
{
#if !defined(NO_SETMODE_SUPPORT) && (defined(_WIN32) || defined(OS2_OS))
    int savedOutMode = -1, savedErrMode = -1;
#endif
    int result;
    uint encoding = cfg( doc, TidyOutCharEncoding );
    uint newline = cfg( doc, TidyNewline );
    StreamOut* sink = TY_(FileOutput)( doc, stdout, encoding, newline );

#if !defined(NO_SETMODE_SUPPORT) && (defined(_WIN32) || defined(OS2_OS))
    savedOutMode = setmode( fileno(stdout), _O_BINARY );
    savedErrMode = setmode( fileno(stderr), _O_BINARY );
#endif

    result = tidyDocSaveStream( doc, sink );

    fflush(stdout);
    fflush(stderr);

#if !defined(NO_SETMODE_SUPPORT) && (defined(_WIN32) || defined(OS2_OS))
    /* -1 means the earlier setmode() call failed: nothing to restore */
    if ( savedOutMode != -1 )
        setmode( fileno(stdout), savedOutMode );
    if ( savedErrMode != -1 )
        setmode( fileno(stderr), savedErrMode );
#endif

    TidyDocFree( doc, sink );
    return result;
}
1028 
/* Write the document into a caller-supplied character buffer.
**
** buffer  destination; may be NULL only when nothing is copied
** buflen  in: capacity of buffer in bytes; out: number of bytes the
**         output occupies (set even when the buffer is too small)
**
** Returns -EINVAL if buflen is NULL, or if buffer is NULL although
** there is output that would fit; -ENOMEM if the output is larger
** than *buflen (nothing is copied); otherwise the result of
** tidyDocSaveStream().  Exactly outbuf.size bytes are copied; this
** function does not append a terminating NUL.
*/
int         tidyDocSaveString( TidyDocImpl* doc, tmbstr buffer, uint* buflen )
{
    uint outenc;
    uint nl;
    TidyBuffer outbuf;
    StreamOut* out;
    int status;

    /* *buflen is both read and written below */
    if ( buflen == NULL )
        return -EINVAL;

    outenc = cfg( doc, TidyOutCharEncoding );
    nl = cfg( doc, TidyNewline );

    tidyBufInitWithAllocator( &outbuf, doc->allocator );
    out = TY_(BufferOutput)( doc, &outbuf, outenc, nl );
    status = tidyDocSaveStream( doc, out );

    if ( outbuf.size > *buflen )
        status = -ENOMEM;
    else if ( outbuf.size > 0 )
    {
        /* Only call memcpy() with valid pointers: an empty output may
        ** leave outbuf.bp unset, and a zero-length copy from or to
        ** NULL is still undefined. */
        if ( buffer != NULL )
            memcpy( buffer, outbuf.bp, outbuf.size );
        else
            status = -EINVAL;
    }

    *buflen = outbuf.size;
    tidyBufFree( &outbuf );
    TidyDocFree( doc, out );
    return status;
}
1051 
/* Write the document into outbuf.
** Returns -EINVAL when outbuf is NULL, otherwise the result of
** tidyDocSaveStream().
*/
int         tidyDocSaveBuffer( TidyDocImpl* doc, TidyBuffer* outbuf )
{
    uint encoding;
    uint newline;
    StreamOut* sink;
    int result;

    if ( outbuf == NULL )
        return -EINVAL;

    encoding = cfg( doc, TidyOutCharEncoding );
    newline = cfg( doc, TidyNewline );
    sink = TY_(BufferOutput)( doc, outbuf, encoding, newline );

    result = tidyDocSaveStream( doc, sink );
    TidyDocFree( doc, sink );
    return result;
}
1066 
/* Write the document to a caller-supplied output sink.
** Returns the result of tidyDocSaveStream().
*/
int         tidyDocSaveSink( TidyDocImpl* doc, TidyOutputSink* sink )
{
    uint encoding = cfg( doc, TidyOutCharEncoding );
    uint newline = cfg( doc, TidyNewline );
    StreamOut* stream = TY_(UserOutput)( doc, sink, encoding, newline );
    int result;

    result = tidyDocSaveStream( doc, stream );
    TidyDocFree( doc, stream );
    return result;
}
1076 
/* Summarise the document's diagnostic counters as a status code:
**   2  at least one error
**   1  no errors, but warnings or accessibility errors
**   0  none of the above
*/
int         tidyDocStatus( TidyDocImpl* doc )
{
    int severity = 0;

    if ( doc->errors > 0 )
        severity = 2;
    else if ( doc->warnings > 0 || doc->accessErrors > 0 )
        severity = 1;

    return severity;
}
1085 
1086 
1087 
tidyCleanAndRepair(TidyDoc tdoc)1088 int TIDY_CALL        tidyCleanAndRepair( TidyDoc tdoc )
1089 {
1090     TidyDocImpl* impl = tidyDocToImpl( tdoc );
1091     if ( impl )
1092       return tidyDocCleanAndRepair( impl );
1093     return -EINVAL;
1094 }
1095 
tidyRunDiagnostics(TidyDoc tdoc)1096 int TIDY_CALL        tidyRunDiagnostics( TidyDoc tdoc )
1097 {
1098     TidyDocImpl* impl = tidyDocToImpl( tdoc );
1099     if ( impl )
1100       return tidyDocRunDiagnostics( impl );
1101     return -EINVAL;
1102 }
1103 
1104 
1105 /* Workhorse functions.
1106 **
1107 ** Parse requires input source, all input config items
1108 ** and diagnostic sink to have all been set before calling.
1109 **
1110 ** Emit likewise requires that document sink and all
1111 ** pretty printing options have been set.
1112 */
/* Message handed to TidyPanic() when TY_(CheckNodeIntegrity)() rejects
** the node tree after parsing or after the repair passes. */
static ctmbstr integrity = "\nPanic - tree has lost its integrity\n";
1114 
TY_(DocParseStream)1115 int         TY_(DocParseStream)( TidyDocImpl* doc, StreamIn* in )
1116 {
1117     Bool xmlIn = cfgBool( doc, TidyXmlTags );
1118     int bomEnc;
1119 
1120     assert( doc != NULL && in != NULL );
1121     assert( doc->docIn == NULL );
1122     doc->docIn = in;
1123 
1124     TY_(TakeConfigSnapshot)( doc );    /* Save config state */
1125     TY_(FreeLexer)( doc );
1126     TY_(FreeAnchors)( doc );
1127 
1128     TY_(FreeNode)(doc, &doc->root);
1129     TidyClearMemory(&doc->root, sizeof(Node));
1130 
1131     if (doc->givenDoctype)
1132         TidyDocFree(doc, doc->givenDoctype);
1133 
1134     doc->givenDoctype = NULL;
1135 
1136     doc->lexer = TY_(NewLexer)( doc );
1137     /* doc->lexer->root = &doc->root; */
1138     doc->root.line = doc->lexer->lines;
1139     doc->root.column = doc->lexer->columns;
1140     doc->inputHadBOM = no;
1141 
1142     bomEnc = TY_(ReadBOMEncoding)(in);
1143 
1144     if (bomEnc != -1)
1145     {
1146         in->encoding = bomEnc;
1147         TY_(SetOptionInt)(doc, TidyInCharEncoding, bomEnc);
1148     }
1149 
1150 #ifdef TIDY_WIN32_MLANG_SUPPORT
1151     if (in->encoding > WIN32MLANG)
1152         TY_(Win32MLangInitInputTranscoder)(in, in->encoding);
1153 #endif /* TIDY_WIN32_MLANG_SUPPORT */
1154 
1155     /* Tidy doesn't alter the doctype for generic XML docs */
1156     if ( xmlIn )
1157     {
1158         TY_(ParseXMLDocument)( doc );
1159         if ( !TY_(CheckNodeIntegrity)( &doc->root ) )
1160             TidyPanic( doc->allocator, integrity );
1161     }
1162     else
1163     {
1164         doc->warnings = 0;
1165         TY_(ParseDocument)( doc );
1166         if ( !TY_(CheckNodeIntegrity)( &doc->root ) )
1167             TidyPanic( doc->allocator, integrity );
1168     }
1169 
1170 #ifdef TIDY_WIN32_MLANG_SUPPORT
1171     TY_(Win32MLangUninitInputTranscoder)(in);
1172 #endif /* TIDY_WIN32_MLANG_SUPPORT */
1173 
1174     doc->docIn = NULL;
1175     return tidyDocStatus( doc );
1176 }
1177 
/* Emit the post-parse diagnostics.
**
** Unless TidyQuiet is set, the markup version and the warning count
** are reported.  If the document has errors and output is not forced,
** TY_(NeedsAuthorIntervention)() is called as well.
** Returns tidyDocStatus( doc ).
*/
int         tidyDocRunDiagnostics( TidyDocImpl* doc )
{
    Bool silent = cfgBool( doc, TidyQuiet );
    Bool forced = cfgBool( doc, TidyForceOutput );

    if ( !silent )
    {
        TY_(ReportMarkupVersion)( doc );
        TY_(ReportNumWarnings)( doc );
    }

    if ( !forced && doc->errors > 0 )
        TY_(NeedsAuthorIntervention)( doc );

    return tidyDocStatus( doc );
}
1195 
/* Run the clean-up and repair passes over the parsed tree.
**
** Nothing is done when TidyXmlTags is set.  Otherwise the passes run
** in a fixed order: emphasis/list/blockquote simplification, optional
** logical emphasis, optional Word 2000 cleanup, optional conversion
** of presentational markup, http-equiv reconciliation, an integrity
** check (TidyPanic() on failure), recording of the given doctype's
** public identifier, doctype/anchor/namespace/language fix-ups, the
** optional generator mark and the optional XML declaration.
** Returns tidyDocStatus( doc ).
*/
int         tidyDocCleanAndRepair( TidyDocImpl* doc )
{
    /* All options are sampled before any pass runs */
    Bool optWord2000   = cfgBool( doc, TidyWord2000 );
    Bool optLogicalEm  = cfgBool( doc, TidyLogicalEmphasis );
    Bool optMakeClean  = cfgBool( doc, TidyMakeClean );
    Bool optDropFont   = cfgBool( doc, TidyDropFontTags );
    Bool optHtmlOut    = cfgBool( doc, TidyHtmlOut );
    Bool optXmlOut     = cfgBool( doc, TidyXmlOut );
    Bool optXhtmlOut   = cfgBool( doc, TidyXhtmlOut );
    Bool optXmlDecl    = cfgBool( doc, TidyXmlDecl );
    Bool optTidyMark   = cfgBool( doc, TidyMark );
    Bool optXmlTags    = cfgBool( doc, TidyXmlTags );
    Bool optAnchorName = cfgBool( doc, TidyAnchorAsName );
    Node* doctype;

    if ( optXmlTags )
        return tidyDocStatus( doc );

    /* simplifies <b><b> ... </b> ...</b> etc. */
    TY_(NestedEmphasis)( doc, &doc->root );

    /* cleans up <dir>indented text</dir> etc. */
    TY_(List2BQ)( doc, &doc->root );
    TY_(BQ2Div)( doc, &doc->root );

    /* replaces i by em and b by strong */
    if ( optLogicalEm )
        TY_(EmFromI)( doc, &doc->root );

    if ( optWord2000 && TY_(IsWord2000)(doc) )
    {
        /* prune Word2000's <![if ...]> ... <![endif]> */
        TY_(DropSections)( doc, &doc->root );

        /* drop style & class attributes and empty p, span elements */
        TY_(CleanWord2000)( doc, &doc->root );
        TY_(DropEmptyElements)( doc, &doc->root );
    }

    /* replaces presentational markup by style rules */
    if ( optMakeClean || optDropFont )
        TY_(CleanDocument)( doc );

    /* Moving terminating <br /> tags out of paragraphs is disabled
    ** due to http://tidy.sf.net/bug/681116 */

    /*  Reconcile http-equiv meta element with output encoding  */
    if (cfg( doc, TidyOutCharEncoding) != RAW
#ifndef NO_NATIVE_ISO2022_SUPPORT
        && cfg( doc, TidyOutCharEncoding) != ISO2022
#endif
        )
    {
        TY_(VerifyHTTPEquiv)( doc, TY_(FindHEAD)( doc ));
    }

    if ( !TY_(CheckNodeIntegrity)( &doc->root ) )
        TidyPanic( doc->allocator, integrity );

    /* remember given doctype for reporting */
    doctype = TY_(FindDocType)(doc);
    if ( doctype != NULL )
    {
        AttVal* publicId = TY_(GetAttrByName)(doctype, "PUBLIC");
        if (AttrHasValue(publicId))
        {
            if ( doc->givenDoctype != NULL )
                TidyDocFree( doc, doc->givenDoctype );
            doc->givenDoctype = TY_(tmbstrdup)(doc->allocator,publicId->value);
        }
    }

    if ( doc->root.content )
    {
        /* XHTML conventions apply only when XHTML output is wanted
        ** and HTML output is not */
        Bool asXhtml = ( optXhtmlOut && !optHtmlOut ) ? yes : no;

        /* If we had XHTML input but want HTML output */
        if ( optHtmlOut && doc->lexer->isvoyager )
        {
            doctype = TY_(FindDocType)(doc);
            /* Remove reference, but do not free */
            if ( doctype != NULL )
                TY_(RemoveNode)(doctype);
        }

        if ( asXhtml )
            TY_(SetXHTMLDocType)(doc);
        else
            TY_(FixDocType)(doc);

        TY_(FixAnchors)(doc, &doc->root, optAnchorName, yes);
        TY_(FixXhtmlNamespace)(doc, asXhtml);
        TY_(FixLanguageInformation)(doc, &doc->root, asXhtml, yes);

        if ( optTidyMark )
            TY_(AddGenerator)(doc);
    }

    /* ensure presence of initial <?xml version="1.0"?> */
    if ( optXmlOut && optXmlDecl )
        TY_(FixXmlDecl)( doc );

    return tidyDocStatus( doc );
}
1304 
1305 static
showBodyOnly(TidyDocImpl * doc,TidyTriState bodyOnly)1306 Bool showBodyOnly( TidyDocImpl* doc, TidyTriState bodyOnly )
1307 {
1308     Node* node;
1309 
1310     switch( bodyOnly )
1311     {
1312     case TidyNoState:
1313         return no;
1314     case TidyYesState:
1315         return yes;
1316     default:
1317         node = TY_(FindBody)( doc );
1318         if (node && node->implicit )
1319             return yes;
1320     }
1321     return no;
1322 }
1323 
1324 
/* Pretty print the document to `out`.
**
** A number of option-controlled tree passes run first (CDATA
** conversion, comment removal, font/wbr cleanup, typography
** downgrade, space normalisation, attribute sorting).  Markup is then
** written only if TidyShowMarkup is set and the document either has
** no errors or output is forced.  The configuration is reset to its
** snapshot before returning.
** Returns tidyDocStatus( doc ).
*/
int         tidyDocSaveStream( TidyDocImpl* doc, StreamOut* out )
{
    Bool wantMarkup    = cfgBool( doc, TidyShowMarkup );
    Bool forced        = cfgBool( doc, TidyForceOutput );
#if SUPPORT_UTF16_ENCODINGS
    TidyTriState bomMode = cfgAutoBool( doc, TidyOutputBOM );
#endif
    Bool asXml         = cfgBool( doc, TidyXmlOut );
    Bool asXhtml       = cfgBool( doc, TidyXhtmlOut );
    TidyTriState bodyMode = cfgAutoBool( doc, TidyBodyOnly );

    Bool stripComments = cfgBool( doc, TidyHideComments );
    Bool clean         = cfgBool( doc, TidyMakeClean );
    Bool ascii         = cfgBool( doc, TidyAsciiChars );
    Bool bare          = cfgBool( doc, TidyMakeBare );
    Bool escCdata      = cfgBool( doc, TidyEscapeCdata );
    TidyAttrSortStrategy attrOrder = cfg( doc, TidySortAttributes );

    if ( escCdata )
        TY_(ConvertCDATANodes)( doc, &doc->root );

    if ( stripComments )
        TY_(DropComments)( doc, &doc->root );

    if ( clean )
    {
        TY_(DropFontElements)( doc, &doc->root, NULL );
        TY_(WbrToSpace)( doc, &doc->root );
    }

    if ( bare || (clean && ascii) )
        TY_(DowngradeTypography)( doc, &doc->root );

    if ( bare )
    {
        /* Note: no longer replaces &nbsp; in */
        /* attribute values / non-text tokens */
        TY_(NormalizeSpaces)( doc->lexer, &doc->root );
    }
    else
    {
        TY_(ReplacePreformattedSpaces)( doc, &doc->root );
    }

    if ( attrOrder != TidySortAttrNone )
        TY_(SortAttributes)( &doc->root, attrOrder );

    if ( wantMarkup && (doc->errors == 0 || forced) )
    {
#if SUPPORT_UTF16_ENCODINGS
        /* Output a Byte Order Mark if required: always when the option
        ** is "yes", and in "auto" mode only if the input had one */
        if ( bomMode == TidyYesState ||
             (doc->inputHadBOM && bomMode == TidyAutoState) )
            TY_(outBOM)( out );
#endif

        /* The printer writes through doc->docOut; it is only set
        ** while printing is in progress */
        doc->docOut = out;

        if ( asXml && !asXhtml )
            TY_(PPrintXMLTree)( doc, NORMAL, 0, &doc->root );
        else if ( showBodyOnly( doc, bodyMode ) )
            TY_(PrintBody)( doc );
        else
            TY_(PPrintTree)( doc, NORMAL, 0, &doc->root );

        TY_(PFlushLine)( doc, 0 );
        doc->docOut = NULL;
    }

    TY_(ResetConfigToSnapshot)( doc );
    return tidyDocStatus( doc );
}
1400 
1401 /* Tree traversal functions
1402 **
1403 ** The big issue here is the degree to which we should mimic
1404 ** a DOM and/or SAX nodes.
1405 **
1406 ** Is it 100% possible (and, if so, how difficult is it) to
1407 ** emit SAX events from this API?  If SAX events are possible,
1408 ** is that 100% of data needed to build a DOM?
1409 */
1410 
tidyGetRoot(TidyDoc tdoc)1411 TidyNode TIDY_CALL   tidyGetRoot( TidyDoc tdoc )
1412 {
1413     TidyDocImpl* impl = tidyDocToImpl( tdoc );
1414     Node* node = NULL;
1415     if ( impl )
1416         node = &impl->root;
1417     return tidyImplToNode( node );
1418 }
1419 
tidyGetHtml(TidyDoc tdoc)1420 TidyNode TIDY_CALL   tidyGetHtml( TidyDoc tdoc )
1421 {
1422   TidyDocImpl* impl = tidyDocToImpl( tdoc );
1423   Node* node = NULL;
1424   if ( impl )
1425       node = TY_(FindHTML)( impl );
1426   return tidyImplToNode( node );
1427 }
1428 
tidyGetHead(TidyDoc tdoc)1429 TidyNode TIDY_CALL    tidyGetHead( TidyDoc tdoc )
1430 {
1431   TidyDocImpl* impl = tidyDocToImpl( tdoc );
1432   Node* node = NULL;
1433   if ( impl )
1434       node = TY_(FindHEAD)( impl );
1435   return tidyImplToNode( node );
1436 }
1437 
tidyGetBody(TidyDoc tdoc)1438 TidyNode TIDY_CALL    tidyGetBody( TidyDoc tdoc )
1439 {
1440   TidyDocImpl* impl = tidyDocToImpl( tdoc );
1441   Node* node = NULL;
1442   if ( impl )
1443       node = TY_(FindBody)( impl );
1444   return tidyImplToNode( node );
1445 }
1446 
1447 /* parent / child */
tidyGetParent(TidyNode tnod)1448 TidyNode TIDY_CALL    tidyGetParent( TidyNode tnod )
1449 {
1450   Node* nimp = tidyNodeToImpl( tnod );
1451   return tidyImplToNode( nimp->parent );
1452 }
tidyGetChild(TidyNode tnod)1453 TidyNode TIDY_CALL    tidyGetChild( TidyNode tnod )
1454 {
1455   Node* nimp = tidyNodeToImpl( tnod );
1456   return tidyImplToNode( nimp->content );
1457 }
1458 
1459 /* siblings */
tidyGetNext(TidyNode tnod)1460 TidyNode TIDY_CALL    tidyGetNext( TidyNode tnod )
1461 {
1462   Node* nimp = tidyNodeToImpl( tnod );
1463   return tidyImplToNode( nimp->next );
1464 }
tidyGetPrev(TidyNode tnod)1465 TidyNode TIDY_CALL    tidyGetPrev( TidyNode tnod )
1466 {
1467   Node* nimp = tidyNodeToImpl( tnod );
1468   return tidyImplToNode( nimp->prev );
1469 }
1470 
1471 /* Node info */
tidyNodeGetType(TidyNode tnod)1472 TidyNodeType TIDY_CALL tidyNodeGetType( TidyNode tnod )
1473 {
1474   Node* nimp = tidyNodeToImpl( tnod );
1475   TidyNodeType ntyp = TidyNode_Root;
1476   if ( nimp )
1477     ntyp = (TidyNodeType) nimp->type;
1478   return ntyp;
1479 }
1480 
tidyNodeLine(TidyNode tnod)1481 uint TIDY_CALL tidyNodeLine( TidyNode tnod )
1482 {
1483   Node* nimp = tidyNodeToImpl( tnod );
1484   uint line = 0;
1485   if ( nimp )
1486     line = nimp->line;
1487   return line;
1488 }
tidyNodeColumn(TidyNode tnod)1489 uint TIDY_CALL tidyNodeColumn( TidyNode tnod )
1490 {
1491   Node* nimp = tidyNodeToImpl( tnod );
1492   uint col = 0;
1493   if ( nimp )
1494     col = nimp->column;
1495   return col;
1496 }
1497 
tidyNodeGetName(TidyNode tnod)1498 ctmbstr TIDY_CALL        tidyNodeGetName( TidyNode tnod )
1499 {
1500   Node* nimp = tidyNodeToImpl( tnod );
1501   ctmbstr nnam = NULL;
1502   if ( nimp )
1503     nnam = nimp->element;
1504   return nnam;
1505 }
1506 
1507 
tidyNodeHasText(TidyDoc tdoc,TidyNode tnod)1508 Bool TIDY_CALL  tidyNodeHasText( TidyDoc tdoc, TidyNode tnod )
1509 {
1510   TidyDocImpl* doc = tidyDocToImpl( tdoc );
1511   if ( doc )
1512       return TY_(nodeHasText)( doc, tidyNodeToImpl(tnod) );
1513   return no;
1514 }
1515 
1516 
tidyNodeGetText(TidyDoc tdoc,TidyNode tnod,TidyBuffer * outbuf)1517 Bool TIDY_CALL  tidyNodeGetText( TidyDoc tdoc, TidyNode tnod, TidyBuffer* outbuf )
1518 {
1519   TidyDocImpl* doc = tidyDocToImpl( tdoc );
1520   Node* nimp = tidyNodeToImpl( tnod );
1521   if ( doc && nimp && outbuf )
1522   {
1523       uint outenc     = cfg( doc, TidyOutCharEncoding );
1524       uint nl         = cfg( doc, TidyNewline );
1525       StreamOut* out  = TY_(BufferOutput)( doc, outbuf, outenc, nl );
1526       Bool xmlOut     = cfgBool( doc, TidyXmlOut );
1527       Bool xhtmlOut   = cfgBool( doc, TidyXhtmlOut );
1528 
1529       doc->docOut = out;
1530       if ( xmlOut && !xhtmlOut )
1531           TY_(PPrintXMLTree)( doc, NORMAL, 0, nimp );
1532       else
1533           TY_(PPrintTree)( doc, NORMAL, 0, nimp );
1534 
1535       TY_(PFlushLine)( doc, 0 );
1536       doc->docOut = NULL;
1537 
1538       TidyDocFree( doc, out );
1539       return yes;
1540   }
1541   return no;
1542 }
1543 
tidyNodeGetValue(TidyDoc tdoc,TidyNode tnod,TidyBuffer * buf)1544 Bool TIDY_CALL tidyNodeGetValue( TidyDoc tdoc, TidyNode tnod, TidyBuffer* buf )
1545 {
1546     TidyDocImpl *doc = tidyDocToImpl( tdoc );
1547     Node *node = tidyNodeToImpl( tnod );
1548     if ( doc == NULL || node == NULL || buf == NULL )
1549         return no;
1550 
1551     switch( node->type ) {
1552     case TextNode:
1553     case CDATATag:
1554     case CommentTag:
1555     case ProcInsTag:
1556     case SectionTag:
1557     case AspTag:
1558     case JsteTag:
1559     case PhpTag:
1560     {
1561         tidyBufClear( buf );
1562         tidyBufAppend( buf, doc->lexer->lexbuf + node->start,
1563                        node->end - node->start );
1564         break;
1565     }
1566     default:
1567         /* The node doesn't have a value */
1568         return no;
1569     }
1570 
1571     return yes;
1572 }
1573 
tidyNodeIsProp(TidyDoc ARG_UNUSED (tdoc),TidyNode tnod)1574 Bool TIDY_CALL tidyNodeIsProp( TidyDoc ARG_UNUSED(tdoc), TidyNode tnod )
1575 {
1576   Node* nimp = tidyNodeToImpl( tnod );
1577   Bool isProprietary = yes;
1578   if ( nimp )
1579   {
1580     switch ( nimp->type )
1581     {
1582     case RootNode:
1583     case DocTypeTag:
1584     case CommentTag:
1585     case XmlDecl:
1586     case ProcInsTag:
1587     case TextNode:
1588     case CDATATag:
1589         isProprietary = no;
1590         break;
1591 
1592     case SectionTag:
1593     case AspTag:
1594     case JsteTag:
1595     case PhpTag:
1596         isProprietary = yes;
1597         break;
1598 
1599     case StartTag:
1600     case EndTag:
1601     case StartEndTag:
1602         isProprietary = ( nimp->tag
1603                           ? (nimp->tag->versions&VERS_PROPRIETARY)!=0
1604                           : yes );
1605         break;
1606     }
1607   }
1608   return isProprietary;
1609 }
1610 
tidyNodeGetId(TidyNode tnod)1611 TidyTagId TIDY_CALL tidyNodeGetId(TidyNode tnod)
1612 {
1613     Node* nimp = tidyNodeToImpl(tnod);
1614 
1615     TidyTagId tagId = TidyTag_UNKNOWN;
1616     if (nimp && nimp->tag)
1617         tagId = nimp->tag->id;
1618 
1619     return tagId;
1620 }
1621 
1622 
1623 /* Null for non-element nodes and all pure HTML
1624 cmbstr       tidyNodeNsLocal( TidyNode tnod )
1625 {
1626 }
1627 cmbstr       tidyNodeNsPrefix( TidyNode tnod )
1628 {
1629 }
1630 cmbstr       tidyNodeNsUri( TidyNode tnod )
1631 {
1632 }
1633 */
1634 
1635 /* Iterate over attribute values */
tidyAttrFirst(TidyNode tnod)1636 TidyAttr TIDY_CALL   tidyAttrFirst( TidyNode tnod )
1637 {
1638   Node* nimp = tidyNodeToImpl( tnod );
1639   AttVal* attval = NULL;
1640   if ( nimp )
1641     attval = nimp->attributes;
1642   return tidyImplToAttr( attval );
1643 }
tidyAttrNext(TidyAttr tattr)1644 TidyAttr TIDY_CALL    tidyAttrNext( TidyAttr tattr )
1645 {
1646   AttVal* attval = tidyAttrToImpl( tattr );
1647   AttVal* nxtval = NULL;
1648   if ( attval )
1649     nxtval = attval->next;
1650   return tidyImplToAttr( nxtval );
1651 }
1652 
tidyAttrName(TidyAttr tattr)1653 ctmbstr TIDY_CALL       tidyAttrName( TidyAttr tattr )
1654 {
1655   AttVal* attval = tidyAttrToImpl( tattr );
1656   ctmbstr anam = NULL;
1657   if ( attval )
1658     anam = attval->attribute;
1659   return anam;
1660 }
tidyAttrValue(TidyAttr tattr)1661 ctmbstr TIDY_CALL       tidyAttrValue( TidyAttr tattr )
1662 {
1663   AttVal* attval = tidyAttrToImpl( tattr );
1664   ctmbstr aval = NULL;
1665   if ( attval )
1666     aval = attval->value;
1667   return aval;
1668 }
1669 
1670 /* Null for pure HTML
1671 ctmbstr       tidyAttrNsLocal( TidyAttr tattr )
1672 {
1673 }
1674 ctmbstr       tidyAttrNsPrefix( TidyAttr tattr )
1675 {
1676 }
1677 ctmbstr       tidyAttrNsUri( TidyAttr tattr )
1678 {
1679 }
1680 */
1681 
tidyAttrGetId(TidyAttr tattr)1682 TidyAttrId TIDY_CALL tidyAttrGetId( TidyAttr tattr )
1683 {
1684   AttVal* attval = tidyAttrToImpl( tattr );
1685   TidyAttrId attrId = TidyAttr_UNKNOWN;
1686   if ( attval && attval->dict )
1687     attrId = attval->dict->id;
1688   return attrId;
1689 }
tidyAttrIsProp(TidyAttr tattr)1690 Bool TIDY_CALL tidyAttrIsProp( TidyAttr tattr )
1691 {
1692   AttVal* attval = tidyAttrToImpl( tattr );
1693   Bool isProprietary = yes;
1694   if ( attval )
1695     isProprietary = ( attval->dict
1696                       ? (attval->dict->versions & VERS_PROPRIETARY) != 0
1697                       : yes );
1698   return isProprietary;
1699 }
1700 
1701 /*
1702  * local variables:
1703  * mode: c
1704  * indent-tabs-mode: nil
1705  * c-basic-offset: 4
1706  * eval: (c-set-offset 'substatement-open 0)
1707  * end:
1708  */
1709