1 /**
2 Routines for parsing and processing an XML document
3 into an R/S data structure.
4
5
6 * See Copyright for the license status of this software.
7
8 */
9
10 #include "DocParse.h"
11
12 #define R_USE_XML_ENCODING 1
13 #include "Utils.h" /* For isBlank() */
14
15
16 /* For the call to stat. */
17 #include <sys/stat.h>
18 #include <unistd.h>
19
20 #include "RSDTD.h"
21
22 #include <stdarg.h>
23
24
25 #include <libxml/xmlschemas.h>
26 #include <libxml/xinclude.h>
27
28
29
30 int RS_XML(setNodeClass)(xmlNodePtr node, USER_OBJECT_ ans);
31 USER_OBJECT_ RS_XML(notifyNamespaceDefinition)(USER_OBJECT_ ns, R_XMLSettings *parserSettings);
32
33
34 void RS_XML(ValidationWarning)(void *ctx, const char *msg, ...);
35 void RS_XML(ValidationError)(void *ctx, const char *msg, ...);
36
37
38 static USER_OBJECT_ convertNode(USER_OBJECT_ ans, xmlNodePtr node, R_XMLSettings *parserSettings);
39 static void NodeTraverse(xmlNodePtr doc, USER_OBJECT_ converterFunctions, R_XMLSettings *parserSettings, int rootFirst);
40
41
42 static USER_OBJECT_ makeSchemaReference(xmlSchemaPtr ref);
43
44
45
46 USER_OBJECT_
RS_XML(libxmlVersionRuntime)47 RS_XML(libxmlVersionRuntime)()
48 {
49 return(mkString(*__xmlParserVersion()));
50 }
51
52
53 USER_OBJECT_
RS_XML(getDefaultValiditySetting)54 RS_XML(getDefaultValiditySetting)(USER_OBJECT_ val)
55 {
56 #ifdef HAVE_VALIDITY
57
58 extern int xmlDoValidityCheckingDefaultValue;
59 USER_OBJECT_ ans;
60 ans = NEW_INTEGER(1);
61 INTEGER_DATA(ans)[0] = xmlDoValidityCheckingDefaultValue;
62
63 if(GET_LENGTH(val))
64 xmlDoValidityCheckingDefaultValue = INTEGER_DATA(val)[0];
65 return(ans);
66
67 #else
68
69 return(NEW_INTEGER(0));
70
71 #endif
72 }
73
74 #include <libxml/parser.h>
75 void
R_xmlStructuredErrorHandler(void * data,xmlErrorPtr err)76 R_xmlStructuredErrorHandler(void *data, xmlErrorPtr err)
77 {
78 RSXML_structuredStop((SEXP) data, err);
79 }
80
81 /**
82 Entry point for reading, parsing and converting an XML tree
83 to an R object.
84
85 fileName is the string identifying the file, and is
86 expanded using the normal rules for an R file name.
87 That is, it can contain environment variables, ~, etc.
88
89 converterFunctions is a collection of functions used to
90 map a node into an R object. This would normally
91 be a closure. It is not currently used, but will be enabled in
92 the future.
93
94 skipBlankLines controls whether text elements consisting
95 simply of white space are included in the resulting
96 structure.
97
98
99 The return value is a simple list with named elements
100 file, version and children
101 The children element is itself a list consisting of
102 objects of class `XMLNode'. Each of these has the characteristic
103
104 */
105 USER_OBJECT_
RS_XML(ParseTree)106 RS_XML(ParseTree)(USER_OBJECT_ fileName, USER_OBJECT_ converterFunctions,
107 USER_OBJECT_ skipBlankLines, USER_OBJECT_ replaceEntities,
108 USER_OBJECT_ asText, USER_OBJECT_ trim, USER_OBJECT_ validate,
109 USER_OBJECT_ getDTD, USER_OBJECT_ isURL,
110 USER_OBJECT_ addNamespaceAttributes,
111 USER_OBJECT_ internalNodeReferences,
112 USER_OBJECT_ s_useHTML, USER_OBJECT_ isSchema,
113 USER_OBJECT_ fullNamespaceInfo, USER_OBJECT_ r_encoding,
114 USER_OBJECT_ useDotNames,
115 USER_OBJECT_ xinclude, USER_OBJECT_ errorFun,
116 USER_OBJECT_ manageMemory, USER_OBJECT_ r_parserOptions,
117 USER_OBJECT_ r_rootFirst)
118 {
119
120 const char *name;
121 xmlDocPtr doc;
122 USER_OBJECT_ rdoc, rdocObj; /* rdocObj is used to put the doc object
123 * under R's garbage collection.*/
124 USER_OBJECT_ className;
125 R_XMLSettings parserSettings;
126
127 int asTextBuffer = LOGICAL_DATA(asText)[0];
128 int isURLDoc = LOGICAL_DATA(isURL)[0];
129 int useHTML = LOGICAL_DATA(s_useHTML)[0];
130
131 const char *encoding = NULL;
132 int freeName = 0;
133 int parserOptions = 0;
134 int rootFirst = INTEGER(r_rootFirst)[0];
135
136 if(GET_LENGTH(r_encoding)) {
137 encoding = CHAR(STRING_ELT(r_encoding, 0));
138 if(!encoding[0])
139 encoding = NULL;
140 }
141
142 if(Rf_length(r_parserOptions))
143 parserOptions = INTEGER(r_parserOptions)[0];
144
145 parserSettings.skipBlankLines = LOGICAL_DATA(skipBlankLines)[0];
146 parserSettings.converters = converterFunctions;
147 parserSettings.useDotNames = LOGICAL_DATA(useDotNames)[0];
148 parserSettings.trim = LOGICAL_DATA(trim)[0];
149 parserSettings.xinclude = LOGICAL_DATA(xinclude)[0];
150 parserSettings.fullNamespaceInfo = LOGICAL_DATA(fullNamespaceInfo)[0];
151
152 parserSettings.internalNodeReferences = LOGICAL_DATA(internalNodeReferences)[0];
153
154 parserSettings.addAttributeNamespaces = LOGICAL_DATA(addNamespaceAttributes)[0];
155 parserSettings.finalize = manageMemory;
156
157 if(asTextBuffer == 0) {
158 struct stat tmp_stat;
159 #ifdef USE_R
160 name = CHAR(STRING_ELT(fileName, 0));
161 #else
162 name = CHARACTER_DATA(fileName)[0];
163 #endif
164 if(!isURLDoc && (name == NULL || stat(name, &tmp_stat) < 0)) {
165 Rf_error("Can't find file %s", CHAR_DEREF(STRING_ELT(fileName, 0)) );
166 }
167 } else {
168 name = strdup(CHAR_DEREF(STRING_ELT(fileName, 0)));
169 freeName = 1;
170 }
171
172 #if 0 /* Done in R now.*/
173 /* If one wants entities expanded directly and to appear as text. */
174 if(LOGICAL_DATA(replaceEntities)[0])
175 xmlSubstituteEntitiesDefault(1);
176 #endif
177
178
179 if(LOGICAL_DATA(isSchema)[0]) {
180 xmlSchemaPtr schema = NULL;
181 xmlSchemaParserCtxtPtr ctxt;
182
183 ctxt = xmlSchemaNewParserCtxt(name);
184 schema = xmlSchemaParse(ctxt);
185 xmlSchemaFreeParserCtxt(ctxt);
186
187
188 /*XXX make certain to cleanup the settings.
189 Put a finalizer on this in makeSchemaReference.
190 */
191
192 return(makeSchemaReference(schema));
193 }
194
195 #ifdef RS_XML_SET_STRUCTURED_ERROR
196 xmlSetStructuredErrorFunc(errorFun == NULL_USER_OBJECT ? NULL : errorFun, R_xmlStructuredErrorHandler);
197 #endif
198
199 if(asTextBuffer) {
200 doc = useHTML ? htmlParseDoc(CHAR_TO_XMLCHAR(name), encoding) :
201 xmlReadMemory(name, (int)strlen(name), NULL, encoding, parserOptions) ;
202 /* xmlParseMemory(name, strlen(name)) */
203
204 if(doc != NULL)
205 doc->name = (char *) xmlStrdup(CHAR_TO_XMLCHAR("<buffer>"));
206
207 } else {
208 doc = useHTML ? htmlParseFile(XMLCHAR_TO_CHAR(name), encoding) :
209 xmlReadFile(name, encoding, parserOptions) /* xmlParseFile(name) */ ;
210 }
211
212 #ifdef RS_XML_SET_STRUCTURED_ERROR
213 xmlSetStructuredErrorFunc(NULL, NULL);
214 #endif
215
216 if(doc == NULL) {
217 if(freeName && name) {
218 #ifdef EXPERIMENTING
219 free((char *) name);
220 #endif
221 }
222 /*XXX Just freed the name ! */
223 if(errorFun != NULL_USER_OBJECT) {
224 RSXML_structuredStop(errorFun, NULL);
225 } else
226 return(stop("XMLParseError", "error in creating parser for %s", name));
227
228 Rf_error("error in creating parser for %s", name);
229 }
230
231 if(TYPEOF(xinclude) == LGLSXP && LOGICAL_DATA(xinclude)[0]) {
232 xmlXIncludeProcessFlags(doc, XML_PARSE_XINCLUDE);
233 } else if(TYPEOF(xinclude) == INTSXP && GET_LENGTH(xinclude) > 0) {
234 xmlXIncludeProcessFlags(doc, INTEGER(xinclude)[0]);
235 }
236
237 if(!useHTML && LOGICAL_DATA(validate)[0]) {
238 xmlValidCtxt ctxt;
239 ctxt.error = RS_XML(ValidationError);
240 ctxt.warning = RS_XML(ValidationWarning);
241
242 if(!xmlValidateDocument(&ctxt, doc)) {
243 if(freeName && name)
244 free((char *) name);
245
246
247 Rf_error("XML document is invalid");
248 }
249 }
250
251 if(parserSettings.internalNodeReferences) {
252 /* Use a different approach - pass internal nodes to the converter functions*/
253 if(GET_LENGTH(converterFunctions) > 0) {
254 xmlNodePtr root;
255 #ifdef USE_OLD_ROOT_CHILD_NAMES
256 root = doc->root;
257 #else
258 root = doc->xmlRootNode;
259 #ifdef ROOT_HAS_DTD_NODE
260 if(root->next && root->children == NULL)
261 root = root->next;
262 #endif
263 #endif
264 PROTECT(rdocObj = R_createXMLDocRef(doc));
265 NodeTraverse(root, converterFunctions, &parserSettings, rootFirst);
266 UNPROTECT(1);
267 }
268 PROTECT(rdoc = NULL_USER_OBJECT);
269 } else {
270 PROTECT(rdoc = RS_XML(convertXMLDoc)(name, doc, converterFunctions, &parserSettings));
271 }
272
273 if(asTextBuffer && name)
274 free((char *) name);
275
276
277 if(!useHTML && !parserSettings.internalNodeReferences && LOGICAL_DATA(getDTD)[0]) {
278 USER_OBJECT_ ans, klass, tmp;
279 const char *names[] = {"doc", "dtd"};
280 PROTECT(ans = NEW_LIST(2));
281 SET_VECTOR_ELT(ans, 0, rdoc);
282 SET_VECTOR_ELT(ans, 1, tmp = RS_XML(ConstructDTDList)(doc, 1, NULL));
283
284 PROTECT(klass = NEW_CHARACTER(1));
285 SET_STRING_ELT( klass, 0, mkChar("DTDList"));
286 SET_CLASS(tmp, klass);
287
288 RS_XML(SetNames)(sizeof(names)/sizeof(names[0]), names, ans);
289
290 UNPROTECT(2); /* release the ans */
291 rdoc = ans;
292 }
293
294 if(parserSettings.internalNodeReferences && GET_LENGTH(converterFunctions) < 1) {
295 UNPROTECT(1);
296 return(R_createXMLDocRef(doc));
297 }
298
299
300 if(!parserSettings.internalNodeReferences) {
301 /* Set the class for the document. */
302 className = NEW_CHARACTER(1);
303 PROTECT(className);
304 SET_STRING_ELT(className, 0, mkChar(useHTML ? "HTMLDocument" : "XMLDocument"));
305 SET_CLASS(rdoc, className);
306 UNPROTECT(1);
307 }
308
309
310 UNPROTECT(1);
311 return(rdoc);
312 }
313
314 enum { FILE_ELEMENT_NAME, VERSION_ELEMENT_NAME, CHILDREN_ELEMENT_NAME, NUM_DOC_ELEMENTS};
315
316
317
318 void
NodeTraverse(xmlNodePtr root,USER_OBJECT_ converterFunctions,R_XMLSettings * parserSettings,int rootFirst)319 NodeTraverse(xmlNodePtr root, USER_OBJECT_ converterFunctions, R_XMLSettings *parserSettings, int rootFirst)
320 {
321 xmlNodePtr c, tmp;
322 c = root;
323
324 while(c) {
325 USER_OBJECT_ ref;
326 #ifndef USE_OLD_ROOT_CHILD_NAMES
327 tmp = c->xmlChildrenNode;
328 #else
329 c->childs;
330 #endif
331
332 if(!rootFirst && tmp)
333 NodeTraverse(tmp, converterFunctions, parserSettings, rootFirst);
334
335 PROTECT(ref = R_createXMLNodeRef(c, parserSettings->finalize));
336 convertNode(ref, c, parserSettings);
337 UNPROTECT(1);
338
339 if(rootFirst && tmp)
340 NodeTraverse(tmp, converterFunctions, parserSettings, rootFirst);
341
342 c = c->next;
343 }
344 }
345
346
347
348
349
350 /**
351 Returns a named list whose elements are
352 file: the name of the file being processed.
353 version: the XML version.
354 root: the collection of children.
355 */
356 USER_OBJECT_
RS_XML(convertXMLDoc)357 RS_XML(convertXMLDoc)(const char *fileName, xmlDocPtr doc, USER_OBJECT_ converterFunctions,
358 R_XMLSettings *parserSettings)
359 {
360 USER_OBJECT_ rdoc;
361 USER_OBJECT_ rdoc_el_names, klass;
362 int n = NUM_DOC_ELEMENTS;
363 const char *version = "";
364 DECL_ENCODING_FROM_DOC(doc)
365
366
367 PROTECT(rdoc = NEW_LIST(n));
368 PROTECT(rdoc_el_names = NEW_CHARACTER(n));
369
370 /* Insert the name of the file being processed */
371 SET_VECTOR_ELT(rdoc, FILE_ELEMENT_NAME, NEW_CHARACTER(1));
372 SET_STRING_ELT(VECTOR_ELT(rdoc, FILE_ELEMENT_NAME), 0,
373 ENC_COPY_TO_USER_STRING(doc->name ? (const xmlChar*)doc->name : (const xmlChar*)fileName));
374 //SET_STRING_ELT(VECTOR_ELT(rdoc, FILE_ELEMENT_NAME), 0, ENC_COPY_TO_USER_STRING(doc->name ? XMLCHAR_TO_CHAR(doc->name) : fileName));
375 SET_STRING_ELT(rdoc_el_names, FILE_ELEMENT_NAME, COPY_TO_USER_STRING("file"));
376
377 /* Insert the XML version information */
378 SET_VECTOR_ELT(rdoc, VERSION_ELEMENT_NAME, NEW_CHARACTER(1));
379 if(doc->version)
380 version = XMLCHAR_TO_CHAR(doc->version);
381
382 SET_STRING_ELT(VECTOR_ELT(rdoc, VERSION_ELEMENT_NAME), 0,
383 COPY_TO_USER_STRING(version));
384 SET_STRING_ELT(rdoc_el_names, VERSION_ELEMENT_NAME, COPY_TO_USER_STRING("version"));
385
386 /* Compute the nodes for this tree, recursively.
387 Note the SIDEWAYS argument to get the sibling nodes
388 at the root, rather than just the first and its children.
389 */
390 {
391 xmlNodePtr root;
392 #ifdef USE_OLD_ROOT_CHILD_NAMES
393 root = doc->root;
394 #else
395 root = doc->xmlRootNode;
396
397 #ifdef ROOT_HAS_DTD_NODE
398 if(root->next && root->children == NULL)
399 root = root->next;
400 #endif
401 #endif
402 SET_VECTOR_ELT(rdoc, CHILDREN_ELEMENT_NAME, RS_XML(createNodeChildren)(root, SIDEWAYS, parserSettings));
403 }
404 SET_STRING_ELT(rdoc_el_names, CHILDREN_ELEMENT_NAME, COPY_TO_USER_STRING("children"));
405
406 SET_NAMES(rdoc, rdoc_el_names);
407
408 PROTECT(klass = NEW_CHARACTER(1));
409 SET_STRING_ELT(klass, 0, COPY_TO_USER_STRING("XMLDocumentContent"));
410 SET_CLASS(rdoc, klass);
411
412 UNPROTECT(3);
413
414 return(rdoc);
415 }
416
417 USER_OBJECT_
processNamespaceDefinitions(xmlNs * ns,xmlNodePtr node,R_XMLSettings * parserSettings)418 processNamespaceDefinitions(xmlNs *ns, xmlNodePtr node, R_XMLSettings *parserSettings)
419 {
420 int n = 0;
421 xmlNs *ptr = ns;
422 USER_OBJECT_ ans, tmp, names;
423 DECL_ENCODING_FROM_NODE(node)
424
425 while(ptr) {
426 ptr = ptr->next;
427 n++;
428 }
429 PROTECT(ans = NEW_LIST(n));
430 PROTECT(names = NEW_CHARACTER(n));
431
432 for(n = 0, ptr = ns; ptr ; n++, ptr = ptr->next) {
433 // protection suggested by rchk
434 tmp = PROTECT(RS_XML(createNameSpaceIdentifier)(ptr,node));
435 (void) RS_XML(notifyNamespaceDefinition)(tmp, parserSettings);
436 SET_VECTOR_ELT(ans, n, tmp);
437 UNPROTECT(1);
438 if(ptr->prefix)
439 SET_STRING_ELT(names, n, ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(ptr->prefix)));
440 }
441
442 SET_NAMES(ans, names);
443 SET_CLASS(ans, mkString("XMLNamespaceDefinitions"));
444 UNPROTECT(2);
445 return(ans);
446 }
447
448 /**
449 Creates an R object representing the specified node, and its children
450 if recursive is non-zero. Certain types of nodes have
451
452 direction controls whether we take the siblings of this node
453 or alternatively its children.
454
455 parentUserNode the previously created user-leve node for the parent of the
456 target node.
457
458 */
459
460 enum { NODE_NAME, NODE_ATTRIBUTES, NODE_CHILDREN, NODE_NAMESPACE, NODE_NAMESPACE_DEFS, NUM_NODE_ELEMENTS};
461
462 USER_OBJECT_
getNamespaceDefs(xmlNodePtr node,int recursive)463 getNamespaceDefs(xmlNodePtr node, int recursive)
464 {
465 USER_OBJECT_ nsDef = NULL_USER_OBJECT;
466
467 if(node->nsDef || recursive) {
468 int numProtects = 0;
469 xmlNs *ptr = node->nsDef;
470 int n = 0;
471 while(ptr) {
472 n++; ptr = ptr->next;
473 }
474
475 PROTECT(nsDef = NEW_LIST(n)); numProtects++;
476 ptr = node->nsDef; n = 0;
477 while(ptr) {
478 SET_VECTOR_ELT(nsDef, n, RS_XML(createNameSpaceIdentifier)(ptr, node));
479 n++; ptr = ptr->next;
480 }
481
482 if(recursive && node->children) {
483 xmlNodePtr ptr = node->children;
484 USER_OBJECT_ tmp;
485 int i;
486
487 PROTECT(nsDef); numProtects++;
488 while(ptr) {
489 PROTECT(tmp = getNamespaceDefs(ptr, 1));
490 /* nsDef = Rf_appendList(nsDef, tmp); */
491 if(Rf_length(tmp)) {
492 n = Rf_length(nsDef);
493 PROTECT(SET_LENGTH(nsDef, n + Rf_length(tmp)));
494 for(i = 0; i < Rf_length(tmp); i++)
495 SET_VECTOR_ELT(nsDef, n + i, VECTOR_ELT(tmp, i));
496 UNPROTECT(3); /* old nsDef, tmp, new nsDef */
497 PROTECT(nsDef);
498 } else
499 UNPROTECT(1); /* tmp */
500 ptr = ptr->next;
501 }
502 }
503
504 SET_CLASS(nsDef, mkString("NamespaceDefinitionList"));
505 UNPROTECT(numProtects);
506 }
507 return(nsDef);
508 }
509
510 USER_OBJECT_
RS_XML(internalNodeNamespaceDefinitions)511 RS_XML(internalNodeNamespaceDefinitions)(USER_OBJECT_ r_node, USER_OBJECT_ recursive)
512 {
513 xmlNodePtr node;
514
515 if(TYPEOF(r_node) != EXTPTRSXP) {
516 Rf_error("R_internalNodeNamespaceDefinitions expects InternalXMLNode objects");
517 }
518
519 node = (xmlNodePtr) R_ExternalPtrAddr(r_node);
520 return(getNamespaceDefs(node, LOGICAL(recursive)[0]));
521 }
522
523 static USER_OBJECT_
RS_XML(createXMLNode)524 RS_XML(createXMLNode)(xmlNodePtr node, int recursive, int direction, R_XMLSettings *parserSettings, USER_OBJECT_ parentUserNode)
525 {
526 int n = NUM_NODE_ELEMENTS;
527 USER_OBJECT_ ans;
528 USER_OBJECT_ ans_el_names;
529 USER_OBJECT_ nsDef = NULL_USER_OBJECT;
530 int addValue;
531 DECL_ENCODING_FROM_NODE(node)
532 char *contentValue = XMLCHAR_TO_CHAR(node->content);
533
534 #ifdef ROOT_HAS_DTD_NODE
535 if(node->type == XML_DTD_NODE)
536 return(NULL);
537 #endif
538
539 if(parserSettings->trim) {
540 contentValue = trim(XMLCHAR_TO_CHAR(node->content));
541 }
542
543 addValue = (contentValue && strlen(contentValue) && isBlank(contentValue) == 0);
544
545 #ifdef LIBXML2
546 if(node->type == XML_ENTITY_DECL)
547 return(NULL);
548 #endif
549
550 /* Drop text nodes that are blank, if that is what the user wanted. */
551 if(parserSettings->skipBlankLines && addValue == 0 && node->type == XML_TEXT_NODE)
552 return(NULL);
553
554
555 if(addValue)
556 n++;
557
558
559 /* If we have a */
560 if(node->type != XML_ELEMENT_DECL) {
561
562
563 /* Create the default return value being a list of name, attributes, children
564 and possibly value.
565
566
567 */
568 PROTECT(ans = NEW_LIST(n));
569 PROTECT(ans_el_names = NEW_CHARACTER(n));
570
571 /* If there are namespace definitions within this node, */
572 if(node->nsDef) {
573 nsDef = processNamespaceDefinitions(node->nsDef, node, parserSettings);
574 SET_VECTOR_ELT(ans, NODE_NAMESPACE_DEFS, nsDef);
575 }
576
577
578 SET_VECTOR_ELT(ans, NODE_NAME, NEW_CHARACTER(1));
579 if(node->name)
580 SET_STRING_ELT(VECTOR_ELT(ans, NODE_NAME), 0, ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(node->name)));
581
582 SET_VECTOR_ELT(ans, NODE_ATTRIBUTES, RS_XML(AttributeList)(node, parserSettings));
583
584 if(recursive)
585 SET_VECTOR_ELT(ans, NODE_CHILDREN, RS_XML(createNodeChildren)(node, direction, parserSettings));
586 else
587 SET_VECTOR_ELT(ans, NODE_CHILDREN, NULL_USER_OBJECT);
588
589
590
591 SET_STRING_ELT(ans_el_names, NODE_NAME, mkChar("name"));
592 SET_STRING_ELT(ans_el_names, NODE_ATTRIBUTES, mkChar("attributes"));
593 SET_STRING_ELT(ans_el_names, NODE_CHILDREN, mkChar("children"));
594 SET_STRING_ELT(ans_el_names, NODE_NAMESPACE, mkChar("namespace"));
595 SET_STRING_ELT(ans_el_names, NODE_NAMESPACE_DEFS, mkChar("namespaceDefinitions"));
596
597 if(node->ns) {
598 PROTECT(nsDef = NEW_CHARACTER(1));
599 if(!parserSettings->fullNamespaceInfo) {
600 if(node->ns->prefix) {
601 SET_STRING_ELT(nsDef, 0, ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(node->ns->prefix)));
602 SET_CLASS(nsDef, mkString("XMLNamespacePrefix"));
603 }
604 } else {
605 if(node->ns->href)
606 SET_STRING_ELT(nsDef, 0, ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(node->ns->href)));
607 if(node->ns->prefix)
608 SET_NAMES(nsDef, ScalarString(ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(node->ns->prefix)))); /* XXX change! */
609 SET_CLASS(nsDef, mkString("XMLNamespace"));
610 }
611 SET_VECTOR_ELT(ans, NODE_NAMESPACE, nsDef);
612 UNPROTECT(1);
613 }
614
615
616
617 if(addValue) {
618 SET_STRING_ELT(ans_el_names, NUM_NODE_ELEMENTS, COPY_TO_USER_STRING("value"));
619 SET_VECTOR_ELT(ans, NUM_NODE_ELEMENTS, NEW_CHARACTER(1));
620 SET_STRING_ELT(VECTOR_ELT(ans, NUM_NODE_ELEMENTS), 0, ENC_COPY_TO_USER_STRING(contentValue));
621
622 if(node->type == XML_ENTITY_REF_NODE)
623 SET_NAMES(VECTOR_ELT(ans, NUM_NODE_ELEMENTS), ScalarString(ENC_COPY_TO_USER_STRING(node->name)));
624 }
625
626 SET_NAMES(ans, ans_el_names);
627
628
629 /* Compute the class of this object based on the type in the
630 XML node.
631 */
632
633 RS_XML(setNodeClass)(node, ans);
634 } else {
635 /* XML_ELEMENT_DECL */
636 ans = NULL_USER_OBJECT;
637 PROTECT(ans);
638 PROTECT(ans);
639 }
640 /* Now invoke any user-level converters. */
641 if(recursive || direction)
642 ans = convertNode(ans, node, parserSettings);
643
644 UNPROTECT(1);
645 UNPROTECT(1);
646 return(ans);
647 }
648
649 static USER_OBJECT_
convertNode(USER_OBJECT_ ans,xmlNodePtr node,R_XMLSettings * parserSettings)650 convertNode(USER_OBJECT_ ans, xmlNodePtr node, R_XMLSettings *parserSettings)
651 {
652 USER_OBJECT_ val = ans;
653
654 if(parserSettings != NULL) {
655 USER_OBJECT_ fun = NULL;
656 const char *funName;
657
658 if(parserSettings->xinclude && (node->type == XML_XINCLUDE_START || node->type == XML_XINCLUDE_END)) {
659 return(NULL);
660 }
661
662
663 if(node->name) {
664 funName = XMLCHAR_TO_CHAR(node->name);
665 fun = RS_XML(findFunction)(funName, parserSettings->converters);
666 }
667
668 if(fun == NULL) {
669 /* Didn't find the tag-specific function in the handlers.
670 So see if there is one for this type node.
671 */
672 fun = RS_XML(lookupGenericNodeConverter)(node, ans, parserSettings);
673 }
674 if(fun != NULL) {
675 USER_OBJECT_ opArgs = NEW_LIST(1);
676 PROTECT(opArgs);
677 SET_VECTOR_ELT(opArgs, 0, ans);
678 val = RS_XML(invokeFunction)(fun, opArgs, NULL, NULL);
679 UNPROTECT(1);
680 }
681 }
682 return(val);
683 }
684
685
686 const char * const XMLNodeClassHierarchy[] = {"XMLNode", "RXMLAbstractNode", "XMLAbstractNode", "oldClass"};
687
688 int
RS_XML(setNodeClass)689 RS_XML(setNodeClass)(xmlNodePtr node, USER_OBJECT_ ans)
690 {
691 char *className = NULL;
692 int numEls = 1;
693 int lenHier = sizeof(XMLNodeClassHierarchy)/sizeof(XMLNodeClassHierarchy[0]);
694
695 numEls = lenHier + 1;
696
697 switch(node->type) {
698 case XML_ENTITY_REF_NODE:
699 className = "XMLEntityRef";
700 break;
701 case XML_PI_NODE:
702 className = "XMLProcessingInstruction";
703 break;
704 case XML_COMMENT_NODE:
705 className = "XMLCommentNode";
706 break;
707 case XML_TEXT_NODE:
708 className = "XMLTextNode";
709 break;
710 case XML_CDATA_SECTION_NODE:
711 className = "XMLCDataNode";
712 break;
713 #ifdef LIBXML2
714 case XML_ENTITY_DECL:
715 className = "XMLEntityDeclaration";
716 break;
717 #endif
718 default:
719 numEls--;
720 break;
721 }
722
723 if(1) {
724 USER_OBJECT_ Class;
725 int ctr = 0, i;
726 PROTECT(Class = NEW_CHARACTER(numEls));
727 if(className)
728 SET_STRING_ELT(Class, ctr++, mkChar(className));
729
730 for(i = 0; i < lenHier; i++)
731 SET_STRING_ELT(Class, ctr++, mkChar(XMLNodeClassHierarchy[i]));
732 SET_CLASS(ans, Class);
733 UNPROTECT(1);
734 }
735
736 return(node->type);
737 }
738
739
740 const char *RS_XML(NameSpaceSlotNames)[] = {"id", "uri", "local"};
741 enum {NAMESPACE_PREFIX_SLOT, NAMESPACE_URI_SLOT, NAMESPACE_TYPE_SLOT, NAMESPACE_NUM_SLOTS};
742
743 /**
744 Create a local object identifying the name space used by a particular node.
745 This is not the name space definition which would have a URL/URI and a type.
746 */
747 USER_OBJECT_
RS_XML(createNameSpaceIdentifier)748 RS_XML(createNameSpaceIdentifier)(xmlNs *space, xmlNodePtr node)
749 {
750
751 USER_OBJECT_ ans;
752 DECL_ENCODING_FROM_NODE(node)
753
754 if(node->nsDef) {
755 PROTECT(ans = NEW_LIST(3));
756 SET_VECTOR_ELT(ans, NAMESPACE_PREFIX_SLOT, NEW_CHARACTER(1));
757 SET_STRING_ELT(VECTOR_ELT(ans, NAMESPACE_PREFIX_SLOT), 0, ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR( (space->prefix ? space->prefix : (xmlChar*)""))));
758
759 SET_VECTOR_ELT(ans, NAMESPACE_URI_SLOT, NEW_CHARACTER(1));
760 SET_STRING_ELT(VECTOR_ELT(ans, NAMESPACE_URI_SLOT), 0, space->href ? ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(space->href)) : NA_STRING);
761
762
763 SET_VECTOR_ELT(ans, NAMESPACE_TYPE_SLOT, NEW_LOGICAL(1));
764 LOGICAL_DATA(VECTOR_ELT(ans, NAMESPACE_TYPE_SLOT))[0] = (space->type == XML_LOCAL_NAMESPACE);
765
766 RS_XML(SetNames)(NAMESPACE_NUM_SLOTS, RS_XML(NameSpaceSlotNames), ans);
767
768 {
769 USER_OBJECT_ klass;
770 PROTECT(klass = NEW_CHARACTER(1));
771 SET_STRING_ELT(klass, 0, COPY_TO_USER_STRING("XMLNamespaceDefinition"));
772 SET_CLASS(ans, klass);
773 UNPROTECT(1);
774 }
775 UNPROTECT(1);
776 } else {
777 PROTECT(ans = NEW_CHARACTER(1));
778 if(space->prefix)
779 SET_STRING_ELT(ans, 0, ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(space->prefix)));
780 UNPROTECT(1);
781 }
782
783
784 return(ans);
785 }
786
787 /**
788 Attempt to find a function in the handler methods corresponding to the
789 type of the node, not its specific tag name.
790 */
791 USER_OBJECT_
RS_XML(lookupGenericNodeConverter)792 RS_XML(lookupGenericNodeConverter)(xmlNodePtr node, USER_OBJECT_ defaultNodeValue,
793 R_XMLSettings *parserSettings)
794 {
795 #define DOT(x) parserSettings->useDotNames ? "." x : x
796 char *name;
797 USER_OBJECT_ fun = NULL;
798 switch(node->type) {
799 case XML_ENTITY_REF_NODE:
800 name = DOT("entity");
801 break;
802 case XML_ENTITY_NODE:
803 name = DOT("entity");
804 break;
805 case XML_ELEMENT_NODE:
806 name = DOT("startElement");
807 break;
808 case XML_PI_NODE:
809 name = DOT("proccesingInstruction");
810 break;
811 case XML_COMMENT_NODE:
812 name = DOT("comment");
813 break;
814 case XML_TEXT_NODE:
815 name = DOT("text");
816 break;
817 case XML_CDATA_SECTION_NODE:
818 name = DOT("cdata");
819 break;
820 default:
821 name = NULL;
822 }
823
824 if(name && name[0])
825 fun = RS_XML(findFunction)(name, parserSettings->converters);
826
827 return(fun);
828 }
829
830 /*
831 XXX Unravel this recursive call into a loop.
832
833 Starting at the top node, fix the id to be empty.
834 Then add the node and get the ID.
835 Then loop over the children, and the node and call the routine
836 on its children
837
838
839 */
840
841 /*
842 at a given node, make the node
843 */
844 void
addNodeAndChildrenToTree(xmlNodePtr node,SEXP id,SEXP e,R_XMLSettings * parserSettings,int * ctr)845 addNodeAndChildrenToTree(xmlNodePtr node, SEXP id, SEXP e, R_XMLSettings *parserSettings, int *ctr)
846 {
847 SEXP tmp;
848 xmlNodePtr n;
849
850 if(!node)
851 return;
852
853 /* Create a skeleton node with no children. */
854 tmp = RS_XML(createXMLNode)(node, 0, 0/* doesn't matter */, parserSettings, R_NilValue);/*XXX*/
855 if(!tmp)
856 return;
857 SETCAR(CDR(e), tmp);
858 (*ctr)++;
859
860 id = Rf_eval(e, R_GlobalEnv);
861 PROTECT(id);
862
863 n = node->children;
864 while(n) {
865
866 SETCAR(CDR(CDR(e)), id);
867 addNodeAndChildrenToTree(n, id, e, parserSettings, ctr);
868 (*ctr)++;
869 n = n->next;
870 }
871
872 UNPROTECT(1);
873 }
874
875
876
877 SEXP
addNodesToTree(xmlNodePtr node,R_XMLSettings * parserSettings)878 addNodesToTree(xmlNodePtr node, R_XMLSettings *parserSettings)
879 {
880 xmlNodePtr ptr = node;
881 SEXP e, id;
882 int ctr = 0;
883 PROTECT(e = allocVector(LANGSXP, 3));
884 SETCAR(e, parserSettings->converters);
885 PROTECT(id = NEW_CHARACTER(0));
886
887 ptr = node;
888
889 /* loop over the sibling nodes here in case we have multiple roots,
890 e.g. a comment, PI and a real node. See xysize.svg
891 */
892 while(ptr) {
893 SETCAR(CDR(CDR(e)), id);
894 addNodeAndChildrenToTree(ptr, id, e, parserSettings, &ctr);
895 ptr = ptr->next;
896 }
897
898 UNPROTECT(2); /* e, id */
899 return(ScalarInteger(ctr));
900 }
901
902
903
904 /**
905 Creates the R objects representing the children or siblings of the specified
906 node, handling simple text cases with no children, as well as recursively
907 processing the children.
908
909 node the node whose children or siblings should be converted.
910
911 direction DOWN or SIDEWAYS indicating the children or siblings should
912 be processed, respectively. If SIDEWAYS is specified, the node itself
913 is included in the result.
914
915 parserSettings "global" information about the parsing conversion for the duration of the parser.
916
917
918 Return list of XMLNode objects.
919 */
920 USER_OBJECT_
RS_XML(createNodeChildren)921 RS_XML(createNodeChildren)(xmlNodePtr node, int direction, R_XMLSettings *parserSettings)
922 {
923 int n = 0, i;
924 USER_OBJECT_ ans = NULL_USER_OBJECT;
925 USER_OBJECT_ elNames = NULL;
926 int unProtect = 0;
927 xmlNodePtr base, c = (direction == SIDEWAYS) ? node :
928 #ifndef USE_OLD_ROOT_CHILD_NAMES
929 node->xmlChildrenNode;
930 #else
931 node->childs;
932 #endif
933
934 DECL_ENCODING_FROM_NODE(node)
935
936 base = c;
937
938 if(IS_FUNCTION(parserSettings->converters)) {
939 return(addNodesToTree(node, parserSettings));
940 }
941
942 /* Count the number of elements being converted. */
943 while(c) {
944 c = c->next;
945 n++;
946 }
947
948 if(n > 0) {
949 USER_OBJECT_ tmp;
950 USER_OBJECT_ tmpNames;
951 int count = 0;
952
953
954 c = base;
955
956 PROTECT(ans = NEW_LIST(n));
957 PROTECT(elNames = NEW_CHARACTER(n));
958
959 unProtect = 2;
960
961 for(i = 0; i < n; i++, c = c->next) {
962 tmp = RS_XML(createXMLNode)(c, 1, DOWN, parserSettings, ans);
963 if(tmp && tmp != NULL_USER_OBJECT) {
964 SET_VECTOR_ELT(ans, count, tmp);
965 if(c->name)
966 SET_STRING_ELT(elNames, count, ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(c->name)));
967 count++;
968 }
969 }
970
971 if(count < n) {
972 /* Reset the length! */
973 #ifdef USE_S
974 #else
975 PROTECT(tmp = NEW_LIST(count));
976 PROTECT(tmpNames = NEW_CHARACTER(count));
977 for(i = 0 ; i < count ; i++) {
978 SET_VECTOR_ELT(tmp, i, VECTOR_ELT(ans, i));
979 SET_STRING_ELT(tmpNames, i, STRING_ELT(elNames, i));
980 }
981 ans = tmp;
982 SET_NAMES(ans, tmpNames);
983 UNPROTECT(4);
984 PROTECT(ans);
985 unProtect = 1;
986 #endif
987 } else {
988 SET_NAMES(ans, elNames);
989 }
990
991 if(unProtect > 0)
992 UNPROTECT(unProtect);
993 }
994
995 return(ans);
996 }
997
998
999
1000 USER_OBJECT_
RS_XML(notifyNamespaceDefinition)1001 RS_XML(notifyNamespaceDefinition)(USER_OBJECT_ arg, R_XMLSettings *parserSettings)
1002 {
1003 USER_OBJECT_ fun, ans = NULL_USER_OBJECT;
1004
1005 fun = RS_XML(findFunction)("namespace", parserSettings->converters);
1006 if(fun != NULL) {
1007 USER_OBJECT_ opArgs = NEW_LIST(1);
1008 USER_OBJECT_ tmp;
1009 PROTECT(opArgs);
1010 SET_VECTOR_ELT(opArgs, 0, arg);
1011 tmp = RS_XML(invokeFunction)(fun, opArgs, NULL, NULL);
1012 ans = tmp;
1013 UNPROTECT(1);
1014 }
1015
1016 return(ans);
1017 }
1018
1019 #ifdef USE_XML_VERSION_H
1020 #ifndef LIBXML_TEST_VERSION
1021 #include <libxml/xmlversion.h>
1022 #endif
1023 #endif
1024
1025 USER_OBJECT_
RS_XML(libxmlVersion)1026 RS_XML(libxmlVersion)()
1027 {
1028 USER_OBJECT_ ans;
1029 unsigned int val;
1030
1031 #ifdef LIBXML_VERSION_NUMBER
1032 val = LIBXML_VERSION_NUMBER;
1033 #else
1034 #ifdef LIBXML_VERSION
1035 val = LIBXML_VERSION;
1036 #else
1037 val = 0;
1038 #endif
1039 #endif
1040
1041 ans = NEW_NUMERIC(1);
1042 NUMERIC_DATA(ans)[0] = val;
1043 return(ans);
1044 }
1045
1046
1047
1048 static
1049 void
notifyError(const char * msg,va_list ap,Rboolean isError)1050 notifyError(const char *msg, va_list ap, Rboolean isError)
1051 {
1052 #if 0
1053 if(isError) {
1054 Rf_error("error in validating XML document");
1055 } else {
1056 Rf_error("warning when validating XML document");
1057 }
1058
1059 #else
1060 #define BUFSIZE 2048
1061 char buf[BUFSIZE];
1062
1063 memset(buf, '\0', BUFSIZE);
1064 vsnprintf(buf, BUFSIZE, msg, ap);
1065
1066 Rf_warning(buf);
1067 #endif
1068 }
1069
1070
1071
1072 void
RS_XML(ValidationError)1073 RS_XML(ValidationError)(void *ctx, const char *format, ...)
1074 {
1075 char *msg = "Message unavailable";
1076 va_list(ap);
1077 va_start(ap, format);
1078
1079 if(strcmp(format, "%s") == 0)
1080 msg = va_arg(ap, char *);
1081
1082 va_end(ap);
1083 notifyError(msg, ap, TRUE);
1084 }
1085
1086 void
RS_XML(ValidationWarning)1087 RS_XML(ValidationWarning)(void *ctx, const char *format, ...)
1088 {
1089 char *msg = "Message unavailable";
1090 va_list(ap);
1091 va_start(ap, format);
1092
1093 if(strcmp(format, "%s") == 0)
1094 msg = va_arg(ap, char *);
1095
1096 va_end(ap);
1097 notifyError(msg, ap, FALSE);
1098 }
1099
1100
1101 USER_OBJECT_
R_createXMLNode(USER_OBJECT_ snode,USER_OBJECT_ handlers,USER_OBJECT_ r_trim,USER_OBJECT_ r_skipBlankLines)1102 R_createXMLNode(USER_OBJECT_ snode, USER_OBJECT_ handlers, USER_OBJECT_ r_trim, USER_OBJECT_ r_skipBlankLines)
1103 {
1104 xmlNodePtr node = (xmlNodePtr) R_ExternalPtrAddr(snode);
1105 R_XMLSettings parserSettings;
1106
1107 parserSettings.converters = handlers;
1108 parserSettings.trim = LOGICAL(r_trim)[0];
1109 parserSettings.skipBlankLines = LOGICAL(r_skipBlankLines)[0];
1110
1111 return(RS_XML(createNodeChildren)(node, SIDEWAYS, &parserSettings));
1112 }
1113
1114
1115
1116 USER_OBJECT_
RS_XML_xmlNodeName(USER_OBJECT_ snode)1117 RS_XML_xmlNodeName(USER_OBJECT_ snode)
1118 {
1119 xmlNodePtr node = (xmlNodePtr) R_ExternalPtrAddr(snode);
1120 USER_OBJECT_ ans;
1121 DECL_ENCODING_FROM_NODE(node)
1122
1123
1124 PROTECT(ans = NEW_CHARACTER(1));
1125 SET_STRING_ELT(ans, 0, node->name ? ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(node->name)) : R_NaString);
1126 UNPROTECT(1);
1127 return(ans);
1128 }
1129
1130
1131 USER_OBJECT_
RS_XML_xmlNodeNamespace(USER_OBJECT_ snode)1132 RS_XML_xmlNodeNamespace(USER_OBJECT_ snode)
1133 {
1134 xmlNodePtr node = (xmlNodePtr) R_ExternalPtrAddr(snode);
1135 USER_OBJECT_ ans;
1136 xmlNs *ns;
1137 DECL_ENCODING_FROM_NODE(node)
1138
1139 ns = node->ns;
1140 if(!ns)
1141 return(NEW_CHARACTER(0));
1142
1143 PROTECT(ans = NEW_CHARACTER(1));
1144 if(ns->href)
1145 SET_STRING_ELT(ans, 0, ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(ns->href)));
1146 if(ns->prefix)
1147 SET_NAMES(ans, ScalarString(ENC_COPY_TO_USER_STRING(ns->prefix)));
1148
1149 SET_CLASS(ans, mkString("XMLNamespace"));
1150 UNPROTECT(1);
1151 return(ans);
1152 }
1153
1154 enum {
1155 R_XML_NS_ADD_PREFIX = 1,
1156 R_XML_NS_ADD_URL_DEFS = 2
1157 };
1158
1159 USER_OBJECT_
RS_XML_xmlNodeAttributes(USER_OBJECT_ snode,USER_OBJECT_ addNamespaces,USER_OBJECT_ addNamespaceURLs)1160 RS_XML_xmlNodeAttributes(USER_OBJECT_ snode, USER_OBJECT_ addNamespaces, USER_OBJECT_ addNamespaceURLs)
1161 {
1162 xmlNodePtr node = (xmlNodePtr) R_ExternalPtrAddr(snode);
1163 R_XMLSettings parserSettings;
1164 parserSettings.addAttributeNamespaces = 0;
1165 if(LOGICAL_DATA(addNamespaces)[0])
1166 parserSettings.addAttributeNamespaces |= R_XML_NS_ADD_PREFIX;
1167 if(LOGICAL_DATA(addNamespaceURLs)[0])
1168 parserSettings.addAttributeNamespaces |= R_XML_NS_ADD_URL_DEFS;
1169
1170 return(RS_XML(AttributeList)(node, &parserSettings));
1171 }
1172
1173 USER_OBJECT_
RS_XML_xmlNodeParent(USER_OBJECT_ snode,USER_OBJECT_ manageMemory)1174 RS_XML_xmlNodeParent(USER_OBJECT_ snode, USER_OBJECT_ manageMemory)
1175 {
1176 xmlNodePtr node = (xmlNodePtr) R_ExternalPtrAddr(snode);
1177 if(node->parent && (node->parent->type == XML_DOCUMENT_NODE || node->parent->type == XML_HTML_DOCUMENT_NODE))
1178 return(NULL_USER_OBJECT);
1179 return(R_createXMLNodeRef(node->parent, manageMemory));
1180 }
1181
1182
1183 USER_OBJECT_
RS_XML_xmlNodeNumChildren(USER_OBJECT_ snode)1184 RS_XML_xmlNodeNumChildren(USER_OBJECT_ snode)
1185 {
1186 xmlNodePtr node = (xmlNodePtr) R_ExternalPtrAddr(snode);
1187 int count = 0;
1188 xmlNodePtr ptr = node->children;
1189
1190 while(ptr) {
1191 count++;
1192 ptr = ptr->next;
1193 }
1194 return(ScalarInteger(count));
1195 }
1196
1197
1198 USER_OBJECT_
RS_XML_xmlNodeChildrenReferences(USER_OBJECT_ snode,USER_OBJECT_ r_addNames,USER_OBJECT_ manageMemory)1199 RS_XML_xmlNodeChildrenReferences(USER_OBJECT_ snode, USER_OBJECT_ r_addNames, USER_OBJECT_ manageMemory)
1200 {
1201 xmlNodePtr node = (xmlNodePtr) R_ExternalPtrAddr(snode);
1202 USER_OBJECT_ ans, names = R_NilValue;
1203 int count = 0, i;
1204 xmlNodePtr ptr = node->children;
1205 int addNames = LOGICAL(r_addNames)[0];
1206 DECL_ENCODING_FROM_NODE(node)
1207 int nprot = 0;
1208
1209 while(ptr) {
1210 count++;
1211 ptr = ptr->next;
1212 }
1213
1214 ptr = node->children;
1215
1216 PROTECT(ans = NEW_LIST(count)); nprot++;
1217 if(addNames) {
1218 PROTECT(names = NEW_CHARACTER(count));
1219 nprot++;
1220 }
1221
1222 for(i = 0; i < count ; i++, ptr = ptr->next) {
1223 SET_VECTOR_ELT(ans, i, R_createXMLNodeRef(ptr, manageMemory));
1224 if(addNames)
1225 SET_STRING_ELT(names, i, ENC_COPY_TO_USER_STRING(ptr->name ? ptr->name : (const xmlChar *)""));
1226 }
1227 if(addNames)
1228 SET_NAMES(ans, names);
1229 UNPROTECT(nprot);
1230
1231 return(ans);
1232 }
1233
1234 USER_OBJECT_
R_getNodeChildByIndex(USER_OBJECT_ snode,USER_OBJECT_ r_index,USER_OBJECT_ manageMemory)1235 R_getNodeChildByIndex(USER_OBJECT_ snode, USER_OBJECT_ r_index, USER_OBJECT_ manageMemory)
1236 {
1237 xmlNodePtr node = (xmlNodePtr) R_ExternalPtrAddr(snode);
1238 int count = 0, num;
1239 xmlNodePtr ptr = node->children;
1240
1241 num = INTEGER(r_index)[0] - 1;
1242 if(num < 0) {
1243 Rf_error("cannot index an internal node with a negative number %d", num);
1244 }
1245
1246
1247 while(ptr && count < num) {
1248 count++;
1249 ptr = ptr->next;
1250 }
1251
1252 return(ptr ? R_createXMLNodeRef(ptr, manageMemory) : NULL_USER_OBJECT);
1253 }
1254
1255
1256
1257 static USER_OBJECT_
makeSchemaReference(xmlSchemaPtr schema)1258 makeSchemaReference(xmlSchemaPtr schema)
1259 {
1260 return(R_makeRefObject(schema, "xmlSchemaRef"));
1261 /*
1262 USER_OBJECT_ ans;
1263 PROTECT(ans = R_MakeExternalPtr(schema, Rf_install("XMLSchema"), R_NilValue));
1264 SET_CLASS(ans, mkString("XMLSchema"));
1265 UNPROTECT(1);
1266 return(ans);
1267 */
1268 }
1269
1270 // unused
1271 #define NO_XML_MEMORY_SHOW_ROUTINE 1
1272
1273 #ifndef NO_XML_MEMORY_SHOW_ROUTINE
1274 void
RS_XML_MemoryShow()1275 RS_XML_MemoryShow()
1276 {
1277 xmlMemDisplay(stderr);
1278 }
1279 #endif
1280
1281 USER_OBJECT_
RS_XML_setDocumentName(USER_OBJECT_ sdoc,USER_OBJECT_ sname)1282 RS_XML_setDocumentName(USER_OBJECT_ sdoc, USER_OBJECT_ sname)
1283 {
1284 /* if doc is NULL in C , return NULL in R
1285 If doc->name is NULL in C, return NA
1286 Otherwise, return the string.
1287 */
1288 xmlDocPtr doc = (xmlDocPtr) R_ExternalPtrAddr(sdoc);
1289
1290 if(!doc) {
1291 Rf_warning("NULL pointer supplied for internal document");
1292 return(R_NilValue);
1293 }
1294
1295 doc->URL = xmlStrdup(CHAR_TO_XMLCHAR(CHAR_DEREF(STRING_ELT(sname, 0))));
1296
1297 return(sdoc);
1298 }
1299
1300
1301
1302 USER_OBJECT_
RS_XML_getDocumentName(USER_OBJECT_ sdoc)1303 RS_XML_getDocumentName(USER_OBJECT_ sdoc)
1304 {
1305 /* if doc is NULL in C , return NULL in R
1306 If doc->name is NULL in C, return NA
1307 Otherwise, return the string.
1308 */
1309 xmlDocPtr doc = (xmlDocPtr) R_ExternalPtrAddr(sdoc);
1310 USER_OBJECT_ ans;
1311 const xmlChar *encoding;
1312 if(!doc) {
1313 Rf_warning("NULL pointer supplied for internal document");
1314 return(R_NilValue);
1315 }
1316 encoding = doc->encoding;
1317 PROTECT(ans = NEW_CHARACTER(1));
1318 SET_STRING_ELT(ans, 0, doc->URL ? ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(doc->URL)) : R_NaString);
1319 UNPROTECT(1);
1320 return(ans);
1321 }
1322
1323
1324 USER_OBJECT_
RS_XML_setKeepBlanksDefault(USER_OBJECT_ val)1325 RS_XML_setKeepBlanksDefault(USER_OBJECT_ val)
1326 {
1327 int prev;
1328 prev = xmlKeepBlanksDefault(INTEGER(val)[0]);
1329 return(ScalarInteger(prev));
1330 }
1331
1332
1333 USER_OBJECT_
RS_XML_xmlXIncludeProcessFlags(USER_OBJECT_ r_doc,USER_OBJECT_ r_flags)1334 RS_XML_xmlXIncludeProcessFlags(USER_OBJECT_ r_doc, USER_OBJECT_ r_flags)
1335 {
1336 xmlDocPtr doc = (xmlDocPtr) R_ExternalPtrAddr(r_doc);
1337 int ans;
1338
1339 ans = xmlXIncludeProcessFlags(doc, INTEGER(r_flags)[0]);
1340 return(ScalarInteger(ans));
1341 }
1342
1343 USER_OBJECT_
RS_XML_xmlXIncludeProcessTreeFlags(USER_OBJECT_ r_node,USER_OBJECT_ r_flags)1344 RS_XML_xmlXIncludeProcessTreeFlags(USER_OBJECT_ r_node, USER_OBJECT_ r_flags)
1345 {
1346 xmlNodePtr node;
1347 int flags = INTEGER(r_flags)[0];
1348 int n;
1349 //xmlNodePtr prev, parent;
1350 SEXP ans = R_NilValue;
1351
1352 node = (xmlNodePtr) R_ExternalPtrAddr(r_node);
1353 //prev = node->prev;
1354 //parent = node->parent;
1355
1356 n = xmlXIncludeProcessTreeFlags(node, flags);
1357
1358 if(n == 0)
1359 return(R_NilValue);
1360 else if(n == -1) {
1361 Rf_error("failed in XInclude");
1362 }
1363
1364 #if 0
1365 if(!prev) {
1366 fprintf(stderr, "Adding to children of %s\n", prev->name);
1367 prev = parent->children;
1368 } else {
1369 fprintf(stderr, "Adding after %s\n", prev->name);
1370 prev = prev->next;
1371 }
1372
1373 prev = node->next;
1374
1375 PROTECT(ans = NEW_LIST(n));
1376 for(i = 0; i < n; i++) {
1377 SET_VECTOR_ELT(ans, i, prev ? R_createXMLNodeRef(prev) : R_NilValue);
1378 prev = prev->next;
1379 }
1380 UNPROTECT(1);
1381 #endif
1382
1383 return(ans);
1384 }
1385
1386
1387
1388 /**
1389 Create an R named list containing the attributes of the specified node.
1390 */
1391
1392 /*
1393 We could use the CONS mechanism rather than doing a double pass.
1394 Not certain what is quicker in this situation. Also, doesn't
1395 work that way in S4, so keep it this way.
1396 */
1397 USER_OBJECT_
RS_XML(AttributeList)1398 RS_XML(AttributeList)(xmlNodePtr node, R_XMLSettings *parserSettings)
1399 {
1400 USER_OBJECT_ ans = NULL_USER_OBJECT;
1401 USER_OBJECT_ ans_names;
1402 xmlAttr * atts;
1403 const xmlChar *encoding = node->doc ? node->doc->encoding : NULL;
1404
1405 int n = 0, i;
1406
1407 /* Count the number of attributes*/
1408 atts = node->properties;
1409
1410 while(atts) {
1411 n++;
1412 atts = atts->next;
1413 }
1414
1415 if(n > 0) {
1416 SEXP ans_namespaces, ans_namespaceDefs;
1417 int nonTrivialAttrNamespaces = 0;
1418 int addNSPrefix = parserSettings->addAttributeNamespaces & R_XML_NS_ADD_PREFIX;
1419 int retNSDefs = parserSettings->addAttributeNamespaces & R_XML_NS_ADD_URL_DEFS;
1420
1421 PROTECT(ans = NEW_CHARACTER(n));
1422 PROTECT(ans_names = NEW_CHARACTER(n));
1423 PROTECT(ans_namespaces = NEW_CHARACTER(n));
1424 PROTECT(ans_namespaceDefs = NEW_CHARACTER(retNSDefs ? n : 0));
1425
1426 /* Loop over the attributes and create the string elements
1427 and the elements of the name vector.
1428 */
1429
1430 atts = node->properties;
1431
1432 for(i=0; i < n ; i++) {
1433 /* Have to be careful that atts->val and atts->val->context are non-null. Something like
1434 <a href=""> kills it otherwise.
1435 */
1436 #ifdef LIBXML2
1437 SET_STRING_ELT(ans, i,
1438 ENC_COPY_TO_USER_STRING(
1439 XMLCHAR_TO_CHAR(
1440 ((atts->xmlChildrenNode != (xmlNode*)NULL && atts->xmlChildrenNode->content != (xmlChar*)NULL )
1441 ? atts->xmlChildrenNode->content : (xmlChar*)""))));
1442 #else
1443 SET_STRING_ELT(ans, i, ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(((atts->val != (xmlNode*)NULL && atts->val->content != (xmlChar*)NULL )
1444 ? atts->val->content : (xmlChar*)""))));
1445
1446 #endif
1447 if(atts->name) {
1448 if(addNSPrefix && atts->ns && atts->ns->prefix) {
1449 char buf[400];
1450 sprintf(buf, "%s:%s", atts->ns->prefix, atts->name);
1451 SET_STRING_ELT(ans_names, i, ENC_COPY_TO_USER_STRING(buf));
1452 } else
1453 SET_STRING_ELT(ans_names, i, ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(atts->name)));
1454
1455 if((addNSPrefix | retNSDefs) && atts->ns && atts->ns->prefix) {
1456 SET_STRING_ELT(ans_namespaces, i, ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(atts->ns->prefix)));
1457
1458 if(retNSDefs)
1459 SET_STRING_ELT(ans_namespaceDefs, i, ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(atts->ns->href)));
1460
1461 nonTrivialAttrNamespaces++;
1462 }
1463 }
1464
1465 atts = atts->next;
1466 }
1467
1468 if(nonTrivialAttrNamespaces) {
1469 if(retNSDefs)
1470 Rf_setAttrib(ans_namespaces, Rf_install("names"), ans_namespaceDefs);
1471 Rf_setAttrib(ans, Rf_install("namespaces"), ans_namespaces);
1472 }
1473 SET_NAMES(ans, ans_names);
1474
1475 UNPROTECT(4);
1476 }
1477 #if 0
1478 else
1479 ans = NEW_CHARACTER(0);
1480 #endif
1481
1482 return(ans);
1483 }
1484
1485
1486
1487 SEXP
R_getDocEncoding(SEXP r_doc)1488 R_getDocEncoding(SEXP r_doc)
1489 {
1490 xmlDocPtr doc = (xmlDocPtr) R_ExternalPtrAddr(r_doc);
1491
1492 const xmlChar *encoding;
1493 SEXP ans;
1494
1495 if(doc->type != XML_DOCUMENT_NODE && doc->type != XML_HTML_DOCUMENT_NODE)
1496 doc = ((xmlNodePtr) doc)->doc;
1497 if(!doc)
1498 return(NEW_CHARACTER(0));
1499
1500 encoding = doc->encoding;
1501 PROTECT(ans = NEW_CHARACTER(1));
1502 SET_STRING_ELT(ans, 0, encoding ? CreateCharSexpWithEncoding(doc->encoding, doc->encoding) : R_NaString);
1503 UNPROTECT(1);
1504
1505 return(ans);
1506 }
1507
1508
1509 int
getTextElementLineNumber(xmlNodePtr node)1510 getTextElementLineNumber(xmlNodePtr node)
1511 {
1512 int val = -1;
1513
1514 if(node->parent)
1515 val = node->parent->line;
1516
1517 xmlNodePtr prev = node->prev;
1518 while(prev) {
1519 if(prev->line > 0) {
1520 val = prev->line;
1521 break;
1522 }
1523 prev = prev->prev;
1524 }
1525 return(val);
1526 }
1527
1528 SEXP
R_getLineNumber(SEXP r_node)1529 R_getLineNumber(SEXP r_node)
1530 {
1531 xmlNodePtr node = (xmlNodePtr) R_ExternalPtrAddr(r_node);
1532
1533 if(!node) {
1534 return(NEW_INTEGER(0));
1535 }
1536
1537 // XML_GET_LINE(node)
1538 return(ScalarInteger(node->line == 0 ?
1539 getTextElementLineNumber(node) : node->line));
1540 }
1541
1542
1543 SEXP
R_xmlReadFile(SEXP r_filename,SEXP r_encoding,SEXP r_options)1544 R_xmlReadFile(SEXP r_filename, SEXP r_encoding, SEXP r_options) //, SEXP manageMemory)
1545 {
1546 const char *filename;
1547 const char *encoding = NULL;
1548 int options;
1549 xmlDocPtr doc;
1550
1551 filename = CHAR_DEREF(STRING_ELT(r_filename, 0));
1552 if(Rf_length(r_encoding))
1553 encoding = CHAR_DEREF(STRING_ELT(r_encoding, 0));
1554 options = INTEGER(r_options)[0];
1555
1556 doc = xmlReadFile(filename, encoding, options);
1557 return(R_createXMLDocRef(doc));
1558 }
1559
1560 SEXP
R_xmlReadMemory(SEXP r_txt,SEXP len,SEXP r_encoding,SEXP r_options,SEXP r_base)1561 R_xmlReadMemory(SEXP r_txt, SEXP len, SEXP r_encoding, SEXP r_options, SEXP r_base) //, SEXP manageMemory)
1562 {
1563 const char *txt;
1564 const char *encoding = NULL;
1565 const char *baseURL = NULL;
1566 int options;
1567 xmlDocPtr doc;
1568
1569 txt = CHAR_DEREF(STRING_ELT(r_txt, 0));
1570 if(Rf_length(r_encoding))
1571 encoding = CHAR_DEREF(STRING_ELT(r_encoding, 0));
1572 options = INTEGER(r_options)[0];
1573
1574 if(Rf_length(r_base))
1575 baseURL = CHAR_DEREF(STRING_ELT(r_base, 0));
1576
1577 doc = xmlReadMemory(txt, INTEGER(len)[0], baseURL, encoding, options);
1578 return(R_createXMLDocRef(doc));
1579 }
1580
1581
1582
1583 #if 1
1584
1585 int
addXInclude(xmlNodePtr ptr,SEXP * ans,int level,SEXP manageMemory)1586 addXInclude(xmlNodePtr ptr, SEXP *ans, int level, SEXP manageMemory)
1587 {
1588 if(ptr->type == XML_XINCLUDE_START) {
1589 int len = Rf_length(*ans) + 1;
1590 SEXP oans = *ans; // avoid sequence-point error
1591 PROTECT(*ans = SET_LENGTH(oans, len));
1592 SET_VECTOR_ELT(*ans, len - 1, R_createXMLNodeRef(ptr, manageMemory));
1593 UNPROTECT(1);
1594 return(1);
1595 } else
1596 return(0);
1597
1598 }
1599
1600 int
processKids(xmlNodePtr ptr,SEXP * ans,int level,SEXP manageMemory)1601 processKids(xmlNodePtr ptr, SEXP *ans, int level, SEXP manageMemory)
1602 {
1603 xmlNodePtr kids;
1604 int count = 0;
1605 kids = ptr->children;
1606 while(kids) {
1607 count += addXInclude(kids, ans, level, manageMemory);
1608 count += processKids(kids, ans, level + 1, manageMemory);
1609 kids = kids->next;
1610 }
1611 return(count);
1612 }
1613
1614 #if 0
1615 int
1616 findXIncludeStartNodes(xmlNodePtr node, SEXP *ans, int level)
1617 {
1618 const char * prefix[] = {"", " ", " ", " " };
1619 xmlNodePtr ptr = node;
1620 int count = 0;
1621
1622 addXInclude(node, ans, level);
1623 ptr = node;
1624 while(ptr) {
1625 count += addXInclude(ptr, ans, level);
1626 count += processKids(ptr, ans, level);
1627 ptr = ptr->next;
1628 }
1629
1630 //fprintf(stderr, "%s level = %d, %s: %p, type = %d\n", prefix[level], level, ptr->name, node, ptr->type);
1631
1632 //fprintf(stderr, "%p, %s, level = %d, type = %d\n", ptr, ptr->name, level, ptr->type);
1633
1634 return(count);
1635 }
1636 #endif
1637
1638 /*
1639 This is a recursive version. We want an iterative version.
1640 */
1641 SEXP
R_findXIncludeStartNodes(SEXP r_root,SEXP manageMemory)1642 R_findXIncludeStartNodes(SEXP r_root, SEXP manageMemory)
1643 {
1644 xmlNodePtr root;
1645 SEXP ans;
1646
1647 root = (xmlNodePtr) R_ExternalPtrAddr(r_root);
1648 if(!root)
1649 return(R_NilValue);
1650
1651 PROTECT(ans = allocVector(VECSXP, 0));
1652 addXInclude(root, &ans, 0, manageMemory);
1653 processKids(root, &ans, 0, manageMemory);
1654 UNPROTECT(1);
1655 return(ans);
1656 }
1657
1658 #endif
1659