/*
  This file uses the HTML parser in libxml to provide an HTML
  parser in R that is basically identical to the XML parsing interface.
  It can handle files, URLs, compressed files, and raw HTML text.
  It drops the DTD and validation options since these are not very relevant
  for HTML. (We can put them back if anyone wants!)
*/
8
9 #include "DocParse.h"
10 #include "Utils.h"
11
12 #include "libxml/HTMLparser.h"
13 #include "libxml/HTMLtree.h"
14
15 #include <sys/stat.h>
16 #include <unistd.h>
17
18 USER_OBJECT_
RS_XML(HtmlParseTree)19 RS_XML(HtmlParseTree)(USER_OBJECT_ fileName, USER_OBJECT_ converterFunctions,
20 USER_OBJECT_ skipBlankLines, USER_OBJECT_ replaceEntities,
21 USER_OBJECT_ asText, USER_OBJECT_ trim, USER_OBJECT_ isURL)
22 {
23 const char *name;
24 xmlDocPtr doc;
25 USER_OBJECT_ rdoc;
26 USER_OBJECT_ className;
27 R_XMLSettings parserSettings;
28 int freeName = 0;
29
30 int asTextBuffer = LOGICAL_DATA(asText)[0];
31 int isURLDoc = LOGICAL_DATA(isURL)[0];
32
33 parserSettings.skipBlankLines = LOGICAL_DATA(skipBlankLines)[0];
34 parserSettings.converters = converterFunctions;
35 parserSettings.trim = LOGICAL_DATA(trim)[0];
36
37 if(asTextBuffer == 0) {
38 struct stat tmp_stat;
39 #ifdef USE_R
40 name = CHAR(STRING_ELT(fileName, 0));
41 #else
42 name = CHARACTER_DATA(fileName)[0];
43 #endif
44 if(!isURLDoc && (name == NULL || stat(name, &tmp_stat) < 0)) {
45 Rf_error("Can't find file %s", CHAR_DEREF(STRING_ELT(fileName, 0)) );
46 }
47 } else {
48 name = strdup(CHAR_DEREF(STRING_ELT(fileName, 0)));
49 freeName = 1;
50 }
51
52
53 #if 0
54 /* If one wants entities expanded directly and to appear as text. */
55 if(LOGICAL_DATA(replaceEntities)[0])
56 xmlSubstituteEntitiesDefault(1);
57 #endif
58
59 if(asTextBuffer) {
60 doc = htmlParseDoc(CHAR_TO_XMLCHAR(name), NULL);
61 if(doc != NULL) {
62 doc->name = (char *) xmlStrdup(CHAR_TO_XMLCHAR("<buffer>"));
63 }
64 } else {
65 doc = htmlParseFile(name, NULL);
66 }
67
68 if(doc == NULL) {
69 if(freeName && name)
70 free((char *) name);
71 Rf_error("error in creating parser for %s", name);
72 }
73
74 PROTECT(rdoc = RS_XML(convertXMLDoc)(name, doc, converterFunctions, &parserSettings));
75
76 if(freeName && name)
77 free((char *) name);
78
79
80 #if 0
81 xmlFreeDoc(doc);
82 R_numXMLDocsFreed++;
83 #endif
84
85 /* Set the class for the document. */
86 className = NEW_CHARACTER(1);
87 PROTECT(className);
88 SET_STRING_ELT(className, 0, mkChar("HTMLDocument"));
89 SET_CLASS(rdoc, className);
90 UNPROTECT(1);
91
92
93 UNPROTECT(1);
94 return(rdoc);
95 }
96
97
98
99
100 /*
101 Copied from RS_XML_printXMLNode (XMLTree.c) with minor changes.
102 */
103 USER_OBJECT_
RS_XML_dumpHTMLDoc(USER_OBJECT_ r_node,USER_OBJECT_ format,USER_OBJECT_ r_encoding,USER_OBJECT_ indent,USER_OBJECT_ outFile)104 RS_XML_dumpHTMLDoc(USER_OBJECT_ r_node, USER_OBJECT_ format, USER_OBJECT_ r_encoding, USER_OBJECT_ indent, USER_OBJECT_ outFile)
105 {
106 USER_OBJECT_ ans;
107 xmlDocPtr node;
108 const char *encoding = NULL;
109 xmlOutputBufferPtr buf;
110 xmlBufferPtr xbuf;
111
112 int oldIndent;
113
114 oldIndent = xmlIndentTreeOutput;
115
116 node = (xmlDocPtr) R_ExternalPtrAddr(r_node);
117
118 xmlIndentTreeOutput = LOGICAL(indent)[0];
119
120 #if ADD_XML_OUTPUT_BUFFER_CODE
121 if(Rf_length(outFile)) {
122 htmlSaveFile(CHAR_DEREF(STRING_ELT(outFile, 0)), node);
123 return(R_NilValue);
124 }
125 #endif
126
127
128 if(GET_LENGTH(r_encoding))
129 encoding = CHAR_DEREF(STRING_ELT(r_encoding, 0));
130
131 xbuf = xmlBufferCreate();
132
133 #if 1
134 buf = xmlOutputBufferCreateBuffer(xbuf, NULL);
135 #else
136 buf = xmlOutputBufferCreateFilename("/tmp/test.out", NULL, 0);
137 #endif
138
139 htmlDocContentDumpFormatOutput(buf, node, encoding, INTEGER(format)[0]);
140 xmlOutputBufferFlush(buf);
141 xmlIndentTreeOutput = oldIndent;
142
143 if(xbuf->use > 0) {
144 /*XXX this const char * in CHARSXP means we have to make multiple copies. */
145 #if 0
146 char *rbuf = R_alloc(sizeof(char) * (xbuf->use + 1));
147 memcpy(rbuf, xbuf->content, xbuf->use + 1);
148 PROTECT(tmp = mkChar(rbuf));
149 #endif
150 // ans = ScalarString(mkChar(xbuf->content));
151 DECL_ENCODING_FROM_DOC(node)
152 ans = ScalarString(ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(xbuf->content)));
153 } else
154 ans = NEW_CHARACTER(1);
155
156 xmlOutputBufferClose(buf);
157
158 return(ans);
159 }
160