1 /*
2   This file uses the HTML parser in libxml to provide an HTML
3   parser in R that is basically identical to the XML parsing interface.
4   It can handle files, URLs, compressed files, and raw HTML text.
5   It drops the DTD and validation options since these are not very relevant
  for HTML. (We can put them back if anyone wants!)
7  */
8 
9 #include "DocParse.h"
10 #include "Utils.h"
11 
12 #include "libxml/HTMLparser.h"
13 #include "libxml/HTMLtree.h"
14 
15 #include <sys/stat.h>
16 #include <unistd.h>
17 
18 USER_OBJECT_
RS_XML(HtmlParseTree)19 RS_XML(HtmlParseTree)(USER_OBJECT_ fileName, USER_OBJECT_ converterFunctions,
20                        USER_OBJECT_ skipBlankLines, USER_OBJECT_ replaceEntities,
21                        USER_OBJECT_ asText, USER_OBJECT_ trim, USER_OBJECT_ isURL)
22 {
23   const char *name;
24   xmlDocPtr doc;
25   USER_OBJECT_ rdoc;
26   USER_OBJECT_ className;
27   R_XMLSettings parserSettings;
28   int freeName = 0;
29 
30   int asTextBuffer = LOGICAL_DATA(asText)[0];
31   int isURLDoc = LOGICAL_DATA(isURL)[0];
32 
33   parserSettings.skipBlankLines = LOGICAL_DATA(skipBlankLines)[0];
34   parserSettings.converters = converterFunctions;
35   parserSettings.trim = LOGICAL_DATA(trim)[0];
36 
37   if(asTextBuffer == 0) {
38     struct stat tmp_stat;
39 #ifdef USE_R
40     name = CHAR(STRING_ELT(fileName, 0));
41 #else
42     name = CHARACTER_DATA(fileName)[0];
43 #endif
44     if(!isURLDoc && (name == NULL || stat(name, &tmp_stat) < 0)) {
45 	Rf_error("Can't find file %s", CHAR_DEREF(STRING_ELT(fileName, 0)) );
46     }
47   } else {
48      name = strdup(CHAR_DEREF(STRING_ELT(fileName, 0)));
49      freeName = 1;
50   }
51 
52 
53 #if 0
54     /* If one wants entities expanded directly and to appear as text.  */
55   if(LOGICAL_DATA(replaceEntities)[0])
56     xmlSubstituteEntitiesDefault(1);
57 #endif
58 
59   if(asTextBuffer) {
60    doc = htmlParseDoc(CHAR_TO_XMLCHAR(name), NULL);
61    if(doc != NULL) {
62       doc->name = (char *) xmlStrdup(CHAR_TO_XMLCHAR("<buffer>"));
63    }
64   } else {
65       doc = htmlParseFile(name, NULL);
66   }
67 
68   if(doc == NULL) {
69     if(freeName && name)
70         free((char *) name);
71     Rf_error("error in creating parser for %s", name);
72   }
73 
74   PROTECT(rdoc = RS_XML(convertXMLDoc)(name, doc, converterFunctions, &parserSettings));
75 
76   if(freeName && name)
77       free((char *) name);
78 
79 
80 #if 0
81   xmlFreeDoc(doc);
82   R_numXMLDocsFreed++;
83 #endif
84 
85      /* Set the class for the document. */
86   className = NEW_CHARACTER(1);
87   PROTECT(className);
88     SET_STRING_ELT(className, 0, mkChar("HTMLDocument"));
89     SET_CLASS(rdoc, className);
90   UNPROTECT(1);
91 
92 
93  UNPROTECT(1);
94  return(rdoc);
95 }
96 
97 
98 
99 
100 /*
101   Copied from  RS_XML_printXMLNode (XMLTree.c)  with minor changes.
102  */
103 USER_OBJECT_
RS_XML_dumpHTMLDoc(USER_OBJECT_ r_node,USER_OBJECT_ format,USER_OBJECT_ r_encoding,USER_OBJECT_ indent,USER_OBJECT_ outFile)104 RS_XML_dumpHTMLDoc(USER_OBJECT_ r_node, USER_OBJECT_ format, USER_OBJECT_ r_encoding, USER_OBJECT_ indent, USER_OBJECT_ outFile)
105 {
106     USER_OBJECT_ ans;
107     xmlDocPtr node;
108     const char *encoding = NULL;
109     xmlOutputBufferPtr buf;
110     xmlBufferPtr xbuf;
111 
112     int oldIndent;
113 
114     oldIndent = xmlIndentTreeOutput;
115 
116     node = (xmlDocPtr) R_ExternalPtrAddr(r_node);
117 
118     xmlIndentTreeOutput =  LOGICAL(indent)[0];
119 
120 #if ADD_XML_OUTPUT_BUFFER_CODE
121     if(Rf_length(outFile)) {
122        htmlSaveFile(CHAR_DEREF(STRING_ELT(outFile, 0)), node);
123        return(R_NilValue);
124     }
125 #endif
126 
127 
128     if(GET_LENGTH(r_encoding))
129 	encoding = CHAR_DEREF(STRING_ELT(r_encoding, 0));
130 
131     xbuf = xmlBufferCreate();
132 
133 #if 1
134     buf = xmlOutputBufferCreateBuffer(xbuf, NULL);
135 #else
136     buf = xmlOutputBufferCreateFilename("/tmp/test.out", NULL, 0);
137 #endif
138 
139     htmlDocContentDumpFormatOutput(buf, node, encoding, INTEGER(format)[0]);
140     xmlOutputBufferFlush(buf);
141     xmlIndentTreeOutput = oldIndent;
142 
143     if(xbuf->use > 0) {
144         /*XXX this const char * in CHARSXP means we have to make multiple copies. */
145 #if 0
146 	char *rbuf = R_alloc(sizeof(char) * (xbuf->use + 1));
147 	memcpy(rbuf, xbuf->content, xbuf->use + 1);
148 	PROTECT(tmp = mkChar(rbuf));
149 #endif
150 	// ans = ScalarString(mkChar(xbuf->content));
151 	DECL_ENCODING_FROM_DOC(node)
152 	ans = ScalarString(ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(xbuf->content)));
153     } else
154       ans = NEW_CHARACTER(1);
155 
156     xmlOutputBufferClose(buf);
157 
158     return(ans);
159 }
160