1 /**
2   Routines for parsing and processing an XML document
3   into an R/S data structure.
4 
5 
6  * See Copyright for the license status of this software.
7 
8  */
9 
10 #include "DocParse.h"
11 
12 #define R_USE_XML_ENCODING 1
13 #include "Utils.h"  /* For isBlank() */
14 
15 
16                     /* For the call to stat. */
17 #include <sys/stat.h>
18 #include <unistd.h>
19 
20 #include "RSDTD.h"
21 
22 #include <stdarg.h>
23 
24 
25 #include <libxml/xmlschemas.h>
26 #include <libxml/xinclude.h>
27 
28 
29 
30 int RS_XML(setNodeClass)(xmlNodePtr node, USER_OBJECT_ ans);
31 USER_OBJECT_ RS_XML(notifyNamespaceDefinition)(USER_OBJECT_ ns, R_XMLSettings *parserSettings);
32 
33 
34 void RS_XML(ValidationWarning)(void *ctx, const char *msg, ...);
35 void RS_XML(ValidationError)(void *ctx, const char *msg, ...);
36 
37 
38 static USER_OBJECT_ convertNode(USER_OBJECT_ ans, xmlNodePtr node, R_XMLSettings *parserSettings);
39 static void NodeTraverse(xmlNodePtr doc, USER_OBJECT_ converterFunctions, R_XMLSettings *parserSettings, int rootFirst);
40 
41 
42 static USER_OBJECT_ makeSchemaReference(xmlSchemaPtr ref);
43 
44 
45 
46 USER_OBJECT_
RS_XML(libxmlVersionRuntime)47 RS_XML(libxmlVersionRuntime)()
48 {
49     return(mkString(*__xmlParserVersion()));
50 }
51 
52 
53 USER_OBJECT_
RS_XML(getDefaultValiditySetting)54 RS_XML(getDefaultValiditySetting)(USER_OBJECT_ val)
55 {
56 #ifdef HAVE_VALIDITY
57 
58  extern int xmlDoValidityCheckingDefaultValue;
59  USER_OBJECT_ ans;
60  ans = NEW_INTEGER(1);
61  INTEGER_DATA(ans)[0] = xmlDoValidityCheckingDefaultValue;
62 
63   if(GET_LENGTH(val))
64      xmlDoValidityCheckingDefaultValue = INTEGER_DATA(val)[0];
65   return(ans);
66 
67 #else
68 
69   return(NEW_INTEGER(0));
70 
71 #endif
72 }
73 
74 #include <libxml/parser.h>
75 void
R_xmlStructuredErrorHandler(void * data,xmlErrorPtr err)76 R_xmlStructuredErrorHandler(void *data, xmlErrorPtr err)
77 {
78 	RSXML_structuredStop((SEXP) data, err);
79 }
80 
81 /**
82   Entry point for reading, parsing and converting an XML tree
83   to an R object.
84 
85   fileName is the string identifying the file, and is
86   expanded using the normal rules for an R file name.
87   That is, it can contain environment variables, ~, etc.
88 
89   converterFunctions is a collection of functions used to
90   map a node into an R object. This would normally
91   be a closure. It is not currently used, but will be enabled in
92   the future.
93 
94   skipBlankLines controls whether text elements consisting
95   simply of white space are included in the resulting
96   structure.
97 
98 
99   The return value is a simple list with named elements
100      file, version and children
101   The children element is itself a list consisting of
102   objects of class `XMLNode'. Each of these has the characteristic
103 
104  */
105 USER_OBJECT_
RS_XML(ParseTree)106 RS_XML(ParseTree)(USER_OBJECT_ fileName, USER_OBJECT_ converterFunctions,
107                     USER_OBJECT_ skipBlankLines, USER_OBJECT_ replaceEntities,
108                      USER_OBJECT_ asText, USER_OBJECT_ trim, USER_OBJECT_ validate,
109                       USER_OBJECT_ getDTD, USER_OBJECT_ isURL,
110                        USER_OBJECT_ addNamespaceAttributes,
111                         USER_OBJECT_ internalNodeReferences,
112 		        USER_OBJECT_ s_useHTML, USER_OBJECT_ isSchema,
113 		        USER_OBJECT_ fullNamespaceInfo, USER_OBJECT_ r_encoding,
114 		        USER_OBJECT_ useDotNames,
115       		         USER_OBJECT_ xinclude, USER_OBJECT_ errorFun,
116   	        	  USER_OBJECT_ manageMemory, USER_OBJECT_ r_parserOptions,
117                           USER_OBJECT_ r_rootFirst)
118 {
119 
120   const char *name;
121   xmlDocPtr doc;
122   USER_OBJECT_ rdoc, rdocObj; /* rdocObj is used to put the doc object
123 			       * under R's garbage collection.*/
124   USER_OBJECT_ className;
125   R_XMLSettings parserSettings;
126 
127   int asTextBuffer = LOGICAL_DATA(asText)[0];
128   int isURLDoc = LOGICAL_DATA(isURL)[0];
129   int useHTML = LOGICAL_DATA(s_useHTML)[0];
130 
131   const char *encoding = NULL;
132   int freeName = 0;
133   int parserOptions = 0;
134   int rootFirst = INTEGER(r_rootFirst)[0];
135 
136   if(GET_LENGTH(r_encoding)) {
137       encoding = CHAR(STRING_ELT(r_encoding, 0));
138       if(!encoding[0])
139 	  encoding = NULL;
140   }
141 
142   if(Rf_length(r_parserOptions))
143      parserOptions = INTEGER(r_parserOptions)[0];
144 
145   parserSettings.skipBlankLines = LOGICAL_DATA(skipBlankLines)[0];
146   parserSettings.converters = converterFunctions;
147   parserSettings.useDotNames = LOGICAL_DATA(useDotNames)[0];
148   parserSettings.trim = LOGICAL_DATA(trim)[0];
149   parserSettings.xinclude = LOGICAL_DATA(xinclude)[0];
150   parserSettings.fullNamespaceInfo = LOGICAL_DATA(fullNamespaceInfo)[0];
151 
152   parserSettings.internalNodeReferences = LOGICAL_DATA(internalNodeReferences)[0];
153 
154   parserSettings.addAttributeNamespaces = LOGICAL_DATA(addNamespaceAttributes)[0];
155   parserSettings.finalize = manageMemory;
156 
157   if(asTextBuffer == 0) {
158     struct stat tmp_stat;
159 #ifdef USE_R
160     name = CHAR(STRING_ELT(fileName, 0));
161 #else
162     name = CHARACTER_DATA(fileName)[0];
163 #endif
164     if(!isURLDoc && (name == NULL || stat(name, &tmp_stat) < 0)) {
165 	Rf_error("Can't find file %s", CHAR_DEREF(STRING_ELT(fileName, 0)) );
166     }
167   } else {
168     name = strdup(CHAR_DEREF(STRING_ELT(fileName, 0)));
169     freeName = 1;
170   }
171 
172 #if 0 /* Done in R now.*/
173     /* If one wants entities expanded directly and to appear as text.  */
174   if(LOGICAL_DATA(replaceEntities)[0])
175       xmlSubstituteEntitiesDefault(1);
176 #endif
177 
178 
179   if(LOGICAL_DATA(isSchema)[0]) {
180       xmlSchemaPtr schema = NULL;
181       xmlSchemaParserCtxtPtr ctxt;
182 
183       ctxt = xmlSchemaNewParserCtxt(name);
184       schema = xmlSchemaParse(ctxt);
185       xmlSchemaFreeParserCtxt(ctxt);
186 
187 
188 /*XXX make certain to cleanup the settings.
189   Put a finalizer on this in makeSchemaReference.
190 */
191 
192       return(makeSchemaReference(schema));
193   }
194 
195 #ifdef RS_XML_SET_STRUCTURED_ERROR
196   xmlSetStructuredErrorFunc(errorFun == NULL_USER_OBJECT ? NULL : errorFun, R_xmlStructuredErrorHandler);
197 #endif
198 
199   if(asTextBuffer) {
200       doc = useHTML ? htmlParseDoc(CHAR_TO_XMLCHAR(name), encoding) :
201 	  xmlReadMemory(name, (int)strlen(name), NULL, encoding, parserOptions) ;
202                 	  /* xmlParseMemory(name, strlen(name)) */
203 
204       if(doc != NULL)
205          doc->name = (char *) xmlStrdup(CHAR_TO_XMLCHAR("<buffer>"));
206 
207   } else {
208       doc = useHTML ? htmlParseFile(XMLCHAR_TO_CHAR(name), encoding) :
209 	              xmlReadFile(name, encoding, parserOptions) /* xmlParseFile(name) */ ;
210   }
211 
212 #ifdef RS_XML_SET_STRUCTURED_ERROR
213   xmlSetStructuredErrorFunc(NULL, NULL);
214 #endif
215 
216   if(doc == NULL) {
217       if(freeName && name) {
218 #ifdef EXPERIMENTING
219 	  free((char *) name);
220 #endif
221       }
222       /*XXX Just freed the name ! */
223       if(errorFun != NULL_USER_OBJECT) {
224         RSXML_structuredStop(errorFun, NULL);
225       } else
226         return(stop("XMLParseError", "error in creating parser for %s", name));
227 
228       Rf_error("error in creating parser for %s", name);
229   }
230 
231   if(TYPEOF(xinclude) == LGLSXP && LOGICAL_DATA(xinclude)[0]) {
232       xmlXIncludeProcessFlags(doc, XML_PARSE_XINCLUDE);
233   } else if(TYPEOF(xinclude) == INTSXP && GET_LENGTH(xinclude) > 0) {
234       xmlXIncludeProcessFlags(doc, INTEGER(xinclude)[0]);
235   }
236 
237   if(!useHTML && LOGICAL_DATA(validate)[0]) {
238       xmlValidCtxt ctxt;
239       ctxt.error = RS_XML(ValidationError);
240       ctxt.warning = RS_XML(ValidationWarning);
241 
242       if(!xmlValidateDocument(&ctxt, doc)) {
243 	  if(freeName && name)
244    	      free((char *) name);
245 
246 
247 	  Rf_error("XML document is invalid");
248       }
249   }
250 
251   if(parserSettings.internalNodeReferences) {
252       /* Use a different approach - pass internal nodes to the converter functions*/
253       if(GET_LENGTH(converterFunctions) > 0) {
254 	  xmlNodePtr root;
255 #ifdef USE_OLD_ROOT_CHILD_NAMES
256 	  root = doc->root;
257 #else
258 	  root = doc->xmlRootNode;
259 #ifdef ROOT_HAS_DTD_NODE
260 	  if(root->next && root->children == NULL)
261 	      root = root->next;
262 #endif
263 #endif
264           PROTECT(rdocObj = R_createXMLDocRef(doc));
265 	  NodeTraverse(root, converterFunctions, &parserSettings, rootFirst);
266 	  UNPROTECT(1);
267       }
268       PROTECT(rdoc = NULL_USER_OBJECT);
269   } else {
270       PROTECT(rdoc = RS_XML(convertXMLDoc)(name, doc, converterFunctions, &parserSettings));
271   }
272 
273   if(asTextBuffer && name)
274       free((char *) name);
275 
276 
277   if(!useHTML && !parserSettings.internalNodeReferences && LOGICAL_DATA(getDTD)[0]) {
278     USER_OBJECT_ ans, klass, tmp;
279     const char *names[] = {"doc", "dtd"};
280       PROTECT(ans = NEW_LIST(2));
281         SET_VECTOR_ELT(ans, 0, rdoc);
282         SET_VECTOR_ELT(ans, 1, tmp = RS_XML(ConstructDTDList)(doc, 1, NULL));
283 
284         PROTECT(klass = NEW_CHARACTER(1));
285         SET_STRING_ELT( klass, 0, mkChar("DTDList"));
286         SET_CLASS(tmp, klass);
287 
288         RS_XML(SetNames)(sizeof(names)/sizeof(names[0]), names, ans);
289 
290       UNPROTECT(2); /* release the ans */
291       rdoc = ans;
292   }
293 
294   if(parserSettings.internalNodeReferences && GET_LENGTH(converterFunctions) < 1) {
295      UNPROTECT(1);
296      return(R_createXMLDocRef(doc));
297   }
298 
299 
300   if(!parserSettings.internalNodeReferences) {
301      /* Set the class for the document. */
302     className = NEW_CHARACTER(1);
303     PROTECT(className);
304       SET_STRING_ELT(className, 0, mkChar(useHTML ? "HTMLDocument" : "XMLDocument"));
305       SET_CLASS(rdoc, className);
306     UNPROTECT(1);
307   }
308 
309 
310  UNPROTECT(1);
311  return(rdoc);
312 }
313 
314 enum { FILE_ELEMENT_NAME, VERSION_ELEMENT_NAME, CHILDREN_ELEMENT_NAME, NUM_DOC_ELEMENTS};
315 
316 
317 
318 void
NodeTraverse(xmlNodePtr root,USER_OBJECT_ converterFunctions,R_XMLSettings * parserSettings,int rootFirst)319 NodeTraverse(xmlNodePtr root, USER_OBJECT_ converterFunctions, R_XMLSettings *parserSettings, int rootFirst)
320 {
321   xmlNodePtr c, tmp;
322     c = root;
323 
324     while(c) {
325 	USER_OBJECT_ ref;
326 #ifndef USE_OLD_ROOT_CHILD_NAMES
327          tmp = c->xmlChildrenNode;
328 #else
329                c->childs;
330 #endif
331 
332         if(!rootFirst && tmp)
333 	    NodeTraverse(tmp, converterFunctions, parserSettings, rootFirst);
334 
335         PROTECT(ref = R_createXMLNodeRef(c, parserSettings->finalize));
336         convertNode(ref, c, parserSettings);
337     	UNPROTECT(1);
338 
339         if(rootFirst && tmp)
340 	    NodeTraverse(tmp, converterFunctions, parserSettings, rootFirst);
341 
342 	c = c->next;
343     }
344 }
345 
346 
347 
348 
349 
350 /**
351    Returns a named list whose elements are
352    file:  the name of the file being processed.
353    version: the XML version.
354    root: the collection of children.
355  */
356 USER_OBJECT_
RS_XML(convertXMLDoc)357 RS_XML(convertXMLDoc)(const char *fileName, xmlDocPtr doc, USER_OBJECT_ converterFunctions,
358                     R_XMLSettings *parserSettings)
359 {
360   USER_OBJECT_ rdoc;
361   USER_OBJECT_ rdoc_el_names, klass;
362   int n = NUM_DOC_ELEMENTS;
363   const char *version = "";
364   DECL_ENCODING_FROM_DOC(doc)
365 
366 
367   PROTECT(rdoc = NEW_LIST(n));
368   PROTECT(rdoc_el_names = NEW_CHARACTER(n));
369 
370     /* Insert the name of the file being processed */
371     SET_VECTOR_ELT(rdoc, FILE_ELEMENT_NAME, NEW_CHARACTER(1));
372     SET_STRING_ELT(VECTOR_ELT(rdoc, FILE_ELEMENT_NAME), 0,
373 		   ENC_COPY_TO_USER_STRING(doc->name ? (const xmlChar*)doc->name : (const xmlChar*)fileName));
374     //SET_STRING_ELT(VECTOR_ELT(rdoc, FILE_ELEMENT_NAME), 0, ENC_COPY_TO_USER_STRING(doc->name ? XMLCHAR_TO_CHAR(doc->name) : fileName));
375     SET_STRING_ELT(rdoc_el_names, FILE_ELEMENT_NAME, COPY_TO_USER_STRING("file"));
376 
377     /* Insert the XML version information */
378   SET_VECTOR_ELT(rdoc, VERSION_ELEMENT_NAME, NEW_CHARACTER(1));
379   if(doc->version)
380 	version = XMLCHAR_TO_CHAR(doc->version);
381 
382   SET_STRING_ELT(VECTOR_ELT(rdoc, VERSION_ELEMENT_NAME), 0,
383                                      COPY_TO_USER_STRING(version));
384   SET_STRING_ELT(rdoc_el_names, VERSION_ELEMENT_NAME, COPY_TO_USER_STRING("version"));
385 
386     /* Compute the nodes for this tree, recursively.
387        Note the SIDEWAYS argument to get the sibling nodes
388        at the root, rather than just the first and its children.
389      */
390 {
391   xmlNodePtr root;
392 #ifdef USE_OLD_ROOT_CHILD_NAMES
393     root = doc->root;
394 #else
395     root = doc->xmlRootNode;
396 
397 #ifdef ROOT_HAS_DTD_NODE
398     if(root->next && root->children == NULL)
399        root = root->next;
400 #endif
401 #endif
402   SET_VECTOR_ELT(rdoc, CHILDREN_ELEMENT_NAME, RS_XML(createNodeChildren)(root,  SIDEWAYS, parserSettings));
403 }
404   SET_STRING_ELT(rdoc_el_names, CHILDREN_ELEMENT_NAME, COPY_TO_USER_STRING("children"));
405 
406   SET_NAMES(rdoc, rdoc_el_names);
407 
408   PROTECT(klass = NEW_CHARACTER(1));
409   SET_STRING_ELT(klass, 0, COPY_TO_USER_STRING("XMLDocumentContent"));
410   SET_CLASS(rdoc, klass);
411 
412   UNPROTECT(3);
413 
414   return(rdoc);
415 }
416 
417 USER_OBJECT_
processNamespaceDefinitions(xmlNs * ns,xmlNodePtr node,R_XMLSettings * parserSettings)418 processNamespaceDefinitions(xmlNs *ns, xmlNodePtr node, R_XMLSettings *parserSettings)
419 {
420   int n = 0;
421   xmlNs *ptr = ns;
422   USER_OBJECT_ ans, tmp, names;
423   DECL_ENCODING_FROM_NODE(node)
424 
425   while(ptr) {
426     ptr = ptr->next;
427     n++;
428   }
429   PROTECT(ans = NEW_LIST(n));
430   PROTECT(names = NEW_CHARACTER(n));
431 
432   for(n = 0, ptr = ns; ptr ; n++, ptr = ptr->next) {
433     // protection suggested by rchk
434     tmp = PROTECT(RS_XML(createNameSpaceIdentifier)(ptr,node));
435     (void) RS_XML(notifyNamespaceDefinition)(tmp, parserSettings);
436     SET_VECTOR_ELT(ans, n, tmp);
437     UNPROTECT(1);
438     if(ptr->prefix)
439        SET_STRING_ELT(names, n, ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(ptr->prefix)));
440   }
441 
442   SET_NAMES(ans, names);
443   SET_CLASS(ans, mkString("XMLNamespaceDefinitions"));
444   UNPROTECT(2);
445   return(ans);
446 }
447 
448 /**
449    Creates an R object representing the specified node, and its children
450    if recursive is non-zero. Certain types of nodes have
451 
452    direction controls whether we take the siblings of this node
453    or alternatively its children.
454 
455    parentUserNode the previously created user-leve node for the parent of the
456             target node.
457 
458  */
459 
460 enum { NODE_NAME, NODE_ATTRIBUTES, NODE_CHILDREN, NODE_NAMESPACE, NODE_NAMESPACE_DEFS, NUM_NODE_ELEMENTS};
461 
462 USER_OBJECT_
getNamespaceDefs(xmlNodePtr node,int recursive)463 getNamespaceDefs(xmlNodePtr node, int recursive)
464 {
465   USER_OBJECT_ nsDef = NULL_USER_OBJECT;
466 
467   if(node->nsDef || recursive) {
468       int numProtects = 0;
469       xmlNs *ptr = node->nsDef;
470       int n = 0;
471       while(ptr) {
472           n++;  ptr = ptr->next;
473       }
474 
475       PROTECT(nsDef = NEW_LIST(n)); numProtects++;
476       ptr = node->nsDef; n = 0;
477       while(ptr) {
478           SET_VECTOR_ELT(nsDef, n, RS_XML(createNameSpaceIdentifier)(ptr, node));
479           n++;  ptr = ptr->next;
480       }
481 
482       if(recursive && node->children) {
483 	  xmlNodePtr ptr = node->children;
484 	  USER_OBJECT_ tmp;
485 	  int i;
486 
487           PROTECT(nsDef); numProtects++;
488 	  while(ptr) {
489 	      PROTECT(tmp = getNamespaceDefs(ptr, 1));
490 /*	  nsDef = Rf_appendList(nsDef, tmp); */
491 	      if(Rf_length(tmp)) {
492 		  n = Rf_length(nsDef);
493 		  PROTECT(SET_LENGTH(nsDef, n + Rf_length(tmp)));
494 		  for(i = 0; i < Rf_length(tmp); i++)
495 		      SET_VECTOR_ELT(nsDef, n + i, VECTOR_ELT(tmp, i));
496                   UNPROTECT(3); /* old nsDef, tmp, new nsDef */
497                   PROTECT(nsDef);
498 	      } else
499    	          UNPROTECT(1); /* tmp */
500 	      ptr = ptr->next;
501 	  }
502       }
503 
504       SET_CLASS(nsDef, mkString("NamespaceDefinitionList"));
505       UNPROTECT(numProtects);
506   }
507   return(nsDef);
508 }
509 
510 USER_OBJECT_
RS_XML(internalNodeNamespaceDefinitions)511 RS_XML(internalNodeNamespaceDefinitions)(USER_OBJECT_ r_node, USER_OBJECT_ recursive)
512 {
513   xmlNodePtr node;
514 
515   if(TYPEOF(r_node) != EXTPTRSXP) {
516       Rf_error("R_internalNodeNamespaceDefinitions expects InternalXMLNode objects");
517     }
518 
519   node = (xmlNodePtr) R_ExternalPtrAddr(r_node);
520   return(getNamespaceDefs(node, LOGICAL(recursive)[0]));
521 }
522 
523 static USER_OBJECT_
RS_XML(createXMLNode)524 RS_XML(createXMLNode)(xmlNodePtr node, int recursive, int direction, R_XMLSettings *parserSettings, USER_OBJECT_ parentUserNode)
525 {
526   int n = NUM_NODE_ELEMENTS;
527   USER_OBJECT_ ans;
528   USER_OBJECT_ ans_el_names;
529   USER_OBJECT_ nsDef = NULL_USER_OBJECT;
530   int addValue;
531   DECL_ENCODING_FROM_NODE(node)
532   char *contentValue = XMLCHAR_TO_CHAR(node->content);
533 
534 #ifdef ROOT_HAS_DTD_NODE
535   if(node->type == XML_DTD_NODE)
536     return(NULL);
537 #endif
538 
539   if(parserSettings->trim) {
540     contentValue = trim(XMLCHAR_TO_CHAR(node->content));
541   }
542 
543   addValue = (contentValue && strlen(contentValue) && isBlank(contentValue) == 0);
544 
545 #ifdef LIBXML2
546   if(node->type == XML_ENTITY_DECL)
547     return(NULL);
548 #endif
549 
550         /* Drop text nodes that are blank, if that is what the user wanted. */
551   if(parserSettings->skipBlankLines && addValue == 0 && node->type == XML_TEXT_NODE)
552     return(NULL);
553 
554 
555   if(addValue)
556     n++;
557 
558 
559   /* If we have a */
560   if(node->type != XML_ELEMENT_DECL)  {
561 
562 
563      /* Create the default return value being a list of name, attributes, children
564         and possibly value.
565 
566 
567       */
568  PROTECT(ans = NEW_LIST(n));
569  PROTECT(ans_el_names = NEW_CHARACTER(n));
570 
571    /* If there are namespace definitions within this node, */
572   if(node->nsDef)  {
573     nsDef = processNamespaceDefinitions(node->nsDef, node, parserSettings);
574     SET_VECTOR_ELT(ans, NODE_NAMESPACE_DEFS, nsDef);
575   }
576 
577 
578   SET_VECTOR_ELT(ans, NODE_NAME, NEW_CHARACTER(1));
579   if(node->name)
580     SET_STRING_ELT(VECTOR_ELT(ans, NODE_NAME), 0, ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(node->name)));
581 
582   SET_VECTOR_ELT(ans, NODE_ATTRIBUTES, RS_XML(AttributeList)(node, parserSettings));
583 
584   if(recursive)
585     SET_VECTOR_ELT(ans, NODE_CHILDREN, RS_XML(createNodeChildren)(node, direction, parserSettings));
586   else
587     SET_VECTOR_ELT(ans, NODE_CHILDREN, NULL_USER_OBJECT);
588 
589 
590 
591   SET_STRING_ELT(ans_el_names, NODE_NAME, mkChar("name"));
592   SET_STRING_ELT(ans_el_names, NODE_ATTRIBUTES,  mkChar("attributes"));
593   SET_STRING_ELT(ans_el_names, NODE_CHILDREN, mkChar("children"));
594   SET_STRING_ELT(ans_el_names, NODE_NAMESPACE, mkChar("namespace"));
595   SET_STRING_ELT(ans_el_names, NODE_NAMESPACE_DEFS, mkChar("namespaceDefinitions"));
596 
597   if(node->ns) {
598     PROTECT(nsDef = NEW_CHARACTER(1));
599     if(!parserSettings->fullNamespaceInfo) {
600 	if(node->ns->prefix) {
601 	    SET_STRING_ELT(nsDef, 0, ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(node->ns->prefix)));
602 	    SET_CLASS(nsDef, mkString("XMLNamespacePrefix"));
603 	}
604     } else {
605 	if(node->ns->href)
606 	    SET_STRING_ELT(nsDef, 0, ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(node->ns->href)));
607 	if(node->ns->prefix)
608 		SET_NAMES(nsDef, ScalarString(ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(node->ns->prefix)))); /* XXX change! */
609 	SET_CLASS(nsDef, mkString("XMLNamespace"));
610     }
611     SET_VECTOR_ELT(ans, NODE_NAMESPACE, nsDef);
612     UNPROTECT(1);
613   }
614 
615 
616 
617   if(addValue) {
618     SET_STRING_ELT(ans_el_names, NUM_NODE_ELEMENTS, COPY_TO_USER_STRING("value"));
619     SET_VECTOR_ELT(ans, NUM_NODE_ELEMENTS, NEW_CHARACTER(1));
620     SET_STRING_ELT(VECTOR_ELT(ans, NUM_NODE_ELEMENTS), 0, ENC_COPY_TO_USER_STRING(contentValue));
621 
622     if(node->type == XML_ENTITY_REF_NODE)
623 	    SET_NAMES(VECTOR_ELT(ans, NUM_NODE_ELEMENTS), ScalarString(ENC_COPY_TO_USER_STRING(node->name)));
624   }
625 
626   SET_NAMES(ans, ans_el_names);
627 
628 
629     /* Compute the class of this object based on the type in the
630        XML node.
631      */
632 
633   RS_XML(setNodeClass)(node, ans);
634   } else {
635       /* XML_ELEMENT_DECL */
636       ans = NULL_USER_OBJECT;
637       PROTECT(ans);
638       PROTECT(ans);
639   }
640      /* Now invoke any user-level converters.  */
641   if(recursive || direction)
642     ans = convertNode(ans, node, parserSettings);
643 
644   UNPROTECT(1);
645   UNPROTECT(1);
646   return(ans);
647 }
648 
649 static USER_OBJECT_
convertNode(USER_OBJECT_ ans,xmlNodePtr node,R_XMLSettings * parserSettings)650 convertNode(USER_OBJECT_ ans, xmlNodePtr node, R_XMLSettings *parserSettings)
651 {
652     USER_OBJECT_ val = ans;
653 
654   if(parserSettings != NULL) {
655     USER_OBJECT_  fun = NULL;
656     const char *funName;
657 
658     if(parserSettings->xinclude && (node->type == XML_XINCLUDE_START || node->type == XML_XINCLUDE_END)) {
659 	return(NULL);
660     }
661 
662 
663        if(node->name) {
664           funName = XMLCHAR_TO_CHAR(node->name);
665           fun = RS_XML(findFunction)(funName, parserSettings->converters);
666        }
667 
668       if(fun == NULL) {
669 	/* Didn't find the tag-specific function in the handlers.
670            So see if there is one for this type node.
671          */
672         fun = RS_XML(lookupGenericNodeConverter)(node, ans, parserSettings);
673       }
674       if(fun != NULL) {
675         USER_OBJECT_ opArgs = NEW_LIST(1);
676 	 PROTECT(opArgs);
677 	 SET_VECTOR_ELT(opArgs, 0, ans);
678 	 val = RS_XML(invokeFunction)(fun, opArgs, NULL, NULL);
679    	 UNPROTECT(1);
680       }
681   }
682   return(val);
683 }
684 
685 
686 const char * const XMLNodeClassHierarchy[] = {"XMLNode", "RXMLAbstractNode", "XMLAbstractNode", "oldClass"};
687 
688 int
RS_XML(setNodeClass)689 RS_XML(setNodeClass)(xmlNodePtr node, USER_OBJECT_ ans)
690 {
691  char *className = NULL;
692  int numEls = 1;
693  int lenHier = sizeof(XMLNodeClassHierarchy)/sizeof(XMLNodeClassHierarchy[0]);
694 
695  numEls = lenHier + 1;
696 
697   switch(node->type) {
698     case XML_ENTITY_REF_NODE:
699       className = "XMLEntityRef";
700       break;
701     case XML_PI_NODE:
702       className = "XMLProcessingInstruction";
703       break;
704     case XML_COMMENT_NODE:
705       className = "XMLCommentNode";
706       break;
707     case XML_TEXT_NODE:
708       className = "XMLTextNode";
709       break;
710     case XML_CDATA_SECTION_NODE:
711       className = "XMLCDataNode";
712       break;
713 #ifdef LIBXML2
714     case XML_ENTITY_DECL:
715       className = "XMLEntityDeclaration";
716       break;
717 #endif
718    default:
719      numEls--;
720      break;
721   }
722 
723   if(1) {
724      USER_OBJECT_ Class;
725      int ctr = 0, i;
726      PROTECT(Class = NEW_CHARACTER(numEls));
727         if(className)
728             SET_STRING_ELT(Class, ctr++, mkChar(className));
729 
730         for(i = 0; i < lenHier; i++)
731           SET_STRING_ELT(Class, ctr++, mkChar(XMLNodeClassHierarchy[i]));
732         SET_CLASS(ans, Class);
733       UNPROTECT(1);
734   }
735 
736   return(node->type);
737 }
738 
739 
740 const char *RS_XML(NameSpaceSlotNames)[] = {"id", "uri", "local"};
741 enum {NAMESPACE_PREFIX_SLOT, NAMESPACE_URI_SLOT, NAMESPACE_TYPE_SLOT, NAMESPACE_NUM_SLOTS};
742 
743 /**
744   Create a local object identifying the name space used by a particular node.
745   This is not the name space definition which would have a URL/URI and a type.
746  */
747 USER_OBJECT_
RS_XML(createNameSpaceIdentifier)748 RS_XML(createNameSpaceIdentifier)(xmlNs *space, xmlNodePtr node)
749 {
750 
751  USER_OBJECT_ ans;
752  DECL_ENCODING_FROM_NODE(node)
753 
754  if(node->nsDef) {
755    PROTECT(ans = NEW_LIST(3));
756      SET_VECTOR_ELT(ans, NAMESPACE_PREFIX_SLOT, NEW_CHARACTER(1));
757      SET_STRING_ELT(VECTOR_ELT(ans, NAMESPACE_PREFIX_SLOT), 0, ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR( (space->prefix ? space->prefix : (xmlChar*)""))));
758 
759      SET_VECTOR_ELT(ans, NAMESPACE_URI_SLOT, NEW_CHARACTER(1));
760      SET_STRING_ELT(VECTOR_ELT(ans, NAMESPACE_URI_SLOT), 0, space->href ? ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(space->href)) : NA_STRING);
761 
762 
763      SET_VECTOR_ELT(ans, NAMESPACE_TYPE_SLOT, NEW_LOGICAL(1));
764      LOGICAL_DATA(VECTOR_ELT(ans, NAMESPACE_TYPE_SLOT))[0] = (space->type == XML_LOCAL_NAMESPACE);
765 
766      RS_XML(SetNames)(NAMESPACE_NUM_SLOTS, RS_XML(NameSpaceSlotNames), ans);
767 
768    {
769     USER_OBJECT_ klass;
770     PROTECT(klass = NEW_CHARACTER(1));
771      SET_STRING_ELT(klass, 0, COPY_TO_USER_STRING("XMLNamespaceDefinition"));
772      SET_CLASS(ans, klass);
773     UNPROTECT(1);
774    }
775    UNPROTECT(1);
776  } else {
777    PROTECT(ans =  NEW_CHARACTER(1));
778    if(space->prefix)
779       SET_STRING_ELT(ans, 0, ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(space->prefix)));
780    UNPROTECT(1);
781  }
782 
783 
784  return(ans);
785 }
786 
787 /**
788   Attempt to find a function in the handler methods corresponding to the
789   type of the node, not its specific tag name.
790  */
791 USER_OBJECT_
RS_XML(lookupGenericNodeConverter)792 RS_XML(lookupGenericNodeConverter)(xmlNodePtr node, USER_OBJECT_ defaultNodeValue,
793                                      R_XMLSettings *parserSettings)
794 {
795 #define DOT(x) parserSettings->useDotNames ? "." x : x
796   char *name;
797   USER_OBJECT_ fun = NULL;
798   switch(node->type) {
799     case XML_ENTITY_REF_NODE:
800 	name = DOT("entity");
801       break;
802     case XML_ENTITY_NODE:
803 	name = DOT("entity");
804       break;
805     case XML_ELEMENT_NODE:
806 	name = DOT("startElement");
807       break;
808     case XML_PI_NODE:
809 	name = DOT("proccesingInstruction");
810       break;
811     case XML_COMMENT_NODE:
812 	name = DOT("comment");
813       break;
814     case XML_TEXT_NODE:
815 	name = DOT("text");
816       break;
817     case XML_CDATA_SECTION_NODE:
818 	name = DOT("cdata");
819       break;
820   default:
821       name = NULL;
822   }
823 
824   if(name && name[0])
825    fun = RS_XML(findFunction)(name, parserSettings->converters);
826 
827  return(fun);
828 }
829 
830 /*
831  XXX Unravel this recursive call into a loop.
832 
833   Starting at the top node, fix the id to be empty.
834   Then add the node and get the ID.
835   Then loop over the children, and the node and call the routine
836   on its children
837 
838 
839  */
840 
841 /*
842    at a given node, make the node
843 */
844 void
addNodeAndChildrenToTree(xmlNodePtr node,SEXP id,SEXP e,R_XMLSettings * parserSettings,int * ctr)845 addNodeAndChildrenToTree(xmlNodePtr node, SEXP id, SEXP e, R_XMLSettings *parserSettings, int *ctr)
846 {
847    SEXP tmp;
848    xmlNodePtr n;
849 
850    if(!node)
851      return;
852 
853         /* Create a skeleton node with no children. */
854    tmp = RS_XML(createXMLNode)(node, 0, 0/* doesn't matter */, parserSettings, R_NilValue);/*XXX*/
855    if(!tmp)
856      return;
857    SETCAR(CDR(e), tmp);
858    (*ctr)++;
859 
860    id = Rf_eval(e, R_GlobalEnv);
861    PROTECT(id);
862 
863    n = node->children;
864    while(n) {
865 
866      SETCAR(CDR(CDR(e)), id);
867      addNodeAndChildrenToTree(n, id, e, parserSettings, ctr);
868      (*ctr)++;
869      n = n->next;
870    }
871 
872    UNPROTECT(1);
873 }
874 
875 
876 
877 SEXP
addNodesToTree(xmlNodePtr node,R_XMLSettings * parserSettings)878 addNodesToTree(xmlNodePtr node, R_XMLSettings *parserSettings)
879 {
880    xmlNodePtr ptr = node;
881    SEXP e, id;
882    int ctr = 0;
883    PROTECT(e = allocVector(LANGSXP, 3));
884    SETCAR(e, parserSettings->converters);
885    PROTECT(id = NEW_CHARACTER(0));
886 
887    ptr = node;
888 
889    /* loop over the sibling nodes here in case we have multiple roots,
890       e.g. a comment, PI and a real node. See xysize.svg
891     */
892    while(ptr) {
893       SETCAR(CDR(CDR(e)), id);
894       addNodeAndChildrenToTree(ptr, id, e, parserSettings, &ctr);
895       ptr = ptr->next;
896    }
897 
898    UNPROTECT(2); /* e, id */
899    return(ScalarInteger(ctr));
900 }
901 
902 
903 
904 /**
905   Creates the R objects representing the children or siblings of the specified
906   node, handling simple text cases with no children, as well as recursively
907   processing the children.
908 
909   node   the node whose children or siblings should be converted.
910 
911   direction DOWN or SIDEWAYS indicating the children or siblings should
912     be processed, respectively. If SIDEWAYS is specified, the node itself
913     is included in the result.
914 
915   parserSettings  "global" information about the parsing conversion for the duration of the parser.
916 
917 
918   Return  list of XMLNode objects.
919  */
920 USER_OBJECT_
RS_XML(createNodeChildren)921 RS_XML(createNodeChildren)(xmlNodePtr node, int direction, R_XMLSettings *parserSettings)
922 {
923   int n = 0, i;
924   USER_OBJECT_ ans = NULL_USER_OBJECT;
925   USER_OBJECT_ elNames = NULL;
926   int unProtect = 0;
927   xmlNodePtr base, c = (direction == SIDEWAYS) ? node :
928 #ifndef USE_OLD_ROOT_CHILD_NAMES
929                               node->xmlChildrenNode;
930 #else
931                               node->childs;
932 #endif
933 
934   DECL_ENCODING_FROM_NODE(node)
935 
936   base = c;
937 
938   if(IS_FUNCTION(parserSettings->converters)) {
939     return(addNodesToTree(node, parserSettings));
940   }
941 
942       /* Count the number of elements being converted. */
943   while(c) {
944     c = c->next;
945     n++;
946   }
947 
948   if(n > 0) {
949     USER_OBJECT_ tmp;
950     USER_OBJECT_ tmpNames;
951     int count = 0;
952 
953 
954     c = base;
955 
956     PROTECT(ans = NEW_LIST(n));
957     PROTECT(elNames = NEW_CHARACTER(n));
958 
959     unProtect = 2;
960 
961     for(i = 0; i < n; i++, c = c->next) {
962 	tmp = RS_XML(createXMLNode)(c, 1, DOWN, parserSettings, ans);
963 	if(tmp && tmp != NULL_USER_OBJECT) {
964 	    SET_VECTOR_ELT(ans, count, tmp);
965 	    if(c->name)
966 		SET_STRING_ELT(elNames, count, ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(c->name)));
967             count++;
968 	}
969     }
970 
971      if(count < n) {
972       /* Reset the length! */
973 #ifdef USE_S
974 #else
975       PROTECT(tmp  = NEW_LIST(count));
976       PROTECT(tmpNames = NEW_CHARACTER(count));
977       for(i = 0 ;  i < count ; i++) {
978         SET_VECTOR_ELT(tmp, i, VECTOR_ELT(ans, i));
979         SET_STRING_ELT(tmpNames, i, STRING_ELT(elNames, i));
980       }
981       ans = tmp;
982       SET_NAMES(ans, tmpNames);
983       UNPROTECT(4);
984       PROTECT(ans);
985       unProtect = 1;
986 #endif
987      } else {
988          SET_NAMES(ans, elNames);
989      }
990 
991     if(unProtect > 0)
992        UNPROTECT(unProtect);
993   }
994 
995   return(ans);
996 }
997 
998 
999 
1000 USER_OBJECT_
RS_XML(notifyNamespaceDefinition)1001 RS_XML(notifyNamespaceDefinition)(USER_OBJECT_ arg, R_XMLSettings *parserSettings)
1002 {
1003  USER_OBJECT_ fun, ans = NULL_USER_OBJECT;
1004 
1005      fun = RS_XML(findFunction)("namespace", parserSettings->converters);
1006      if(fun != NULL) {
1007         USER_OBJECT_ opArgs = NEW_LIST(1);
1008         USER_OBJECT_ tmp;
1009 	 PROTECT(opArgs);
1010 	 SET_VECTOR_ELT(opArgs, 0, arg);
1011 	 tmp = RS_XML(invokeFunction)(fun, opArgs, NULL, NULL);
1012          ans = tmp;
1013    	 UNPROTECT(1);
1014       }
1015 
1016  return(ans);
1017 }
1018 
1019 #ifdef USE_XML_VERSION_H
1020 #ifndef LIBXML_TEST_VERSION
1021 #include <libxml/xmlversion.h>
1022 #endif
1023 #endif
1024 
1025 USER_OBJECT_
RS_XML(libxmlVersion)1026 RS_XML(libxmlVersion)()
1027 {
1028  USER_OBJECT_ ans;
1029  unsigned int val;
1030 
1031 #ifdef LIBXML_VERSION_NUMBER
1032  val = LIBXML_VERSION_NUMBER;
1033 #else
1034 #ifdef LIBXML_VERSION
1035  val = LIBXML_VERSION;
1036 #else
1037  val = 0;
1038 #endif
1039 #endif
1040 
1041  ans = NEW_NUMERIC(1);
1042  NUMERIC_DATA(ans)[0] = val;
1043  return(ans);
1044 }
1045 
1046 
1047 
1048 static
1049 void
notifyError(const char * msg,va_list ap,Rboolean isError)1050 notifyError(const char *msg, va_list ap, Rboolean isError)
1051 {
1052 #if 0
1053     if(isError) {
1054 	Rf_error("error in validating XML document");
1055     } else {
1056 	Rf_error("warning when validating XML document");
1057     }
1058 
1059 #else
1060 #define BUFSIZE 2048
1061     char buf[BUFSIZE];
1062 
1063     memset(buf, '\0', BUFSIZE);
1064     vsnprintf(buf, BUFSIZE, msg, ap);
1065 
1066     Rf_warning(buf);
1067 #endif
1068 }
1069 
1070 
1071 
1072 void
RS_XML(ValidationError)1073 RS_XML(ValidationError)(void *ctx, const char *format, ...)
1074 {
1075   char *msg = "Message unavailable";
1076   va_list(ap);
1077   va_start(ap, format);
1078 
1079   if(strcmp(format, "%s") == 0)
1080     msg = va_arg(ap, char *);
1081 
1082   va_end(ap);
1083   notifyError(msg, ap, TRUE);
1084 }
1085 
1086 void
RS_XML(ValidationWarning)1087 RS_XML(ValidationWarning)(void *ctx, const char *format, ...)
1088 {
1089   char *msg = "Message unavailable";
1090   va_list(ap);
1091   va_start(ap, format);
1092 
1093   if(strcmp(format, "%s") == 0)
1094     msg = va_arg(ap, char *);
1095 
1096   va_end(ap);
1097   notifyError(msg, ap, FALSE);
1098 }
1099 
1100 
1101 USER_OBJECT_
R_createXMLNode(USER_OBJECT_ snode,USER_OBJECT_ handlers,USER_OBJECT_ r_trim,USER_OBJECT_ r_skipBlankLines)1102 R_createXMLNode(USER_OBJECT_ snode, USER_OBJECT_ handlers, USER_OBJECT_ r_trim, USER_OBJECT_ r_skipBlankLines)
1103 {
1104     xmlNodePtr node = (xmlNodePtr) R_ExternalPtrAddr(snode);
1105     R_XMLSettings parserSettings;
1106 
1107     parserSettings.converters = handlers;
1108     parserSettings.trim = LOGICAL(r_trim)[0];
1109     parserSettings.skipBlankLines = LOGICAL(r_skipBlankLines)[0];
1110 
1111     return(RS_XML(createNodeChildren)(node, SIDEWAYS, &parserSettings));
1112 }
1113 
1114 
1115 
1116 USER_OBJECT_
RS_XML_xmlNodeName(USER_OBJECT_ snode)1117 RS_XML_xmlNodeName(USER_OBJECT_ snode)
1118 {
1119     xmlNodePtr node = (xmlNodePtr) R_ExternalPtrAddr(snode);
1120     USER_OBJECT_ ans;
1121     DECL_ENCODING_FROM_NODE(node)
1122 
1123 
1124     PROTECT(ans = NEW_CHARACTER(1));
1125     SET_STRING_ELT(ans, 0, node->name ? ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(node->name)) : R_NaString);
1126     UNPROTECT(1);
1127     return(ans);
1128 }
1129 
1130 
1131 USER_OBJECT_
RS_XML_xmlNodeNamespace(USER_OBJECT_ snode)1132 RS_XML_xmlNodeNamespace(USER_OBJECT_ snode)
1133 {
1134     xmlNodePtr node = (xmlNodePtr) R_ExternalPtrAddr(snode);
1135     USER_OBJECT_ ans;
1136     xmlNs *ns;
1137     DECL_ENCODING_FROM_NODE(node)
1138 
1139     ns = node->ns;
1140     if(!ns)
1141 	return(NEW_CHARACTER(0));
1142 
1143     PROTECT(ans = NEW_CHARACTER(1));
1144     if(ns->href)
1145         SET_STRING_ELT(ans, 0, ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(ns->href)));
1146     if(ns->prefix)
1147         SET_NAMES(ans, ScalarString(ENC_COPY_TO_USER_STRING(ns->prefix)));
1148 
1149     SET_CLASS(ans, mkString("XMLNamespace"));
1150     UNPROTECT(1);
1151     return(ans);
1152 }
1153 
1154 enum  {
1155     R_XML_NS_ADD_PREFIX = 1,
1156     R_XML_NS_ADD_URL_DEFS = 2
1157 };
1158 
1159 USER_OBJECT_
RS_XML_xmlNodeAttributes(USER_OBJECT_ snode,USER_OBJECT_ addNamespaces,USER_OBJECT_ addNamespaceURLs)1160 RS_XML_xmlNodeAttributes(USER_OBJECT_ snode, USER_OBJECT_ addNamespaces, USER_OBJECT_ addNamespaceURLs)
1161 {
1162     xmlNodePtr node = (xmlNodePtr) R_ExternalPtrAddr(snode);
1163     R_XMLSettings parserSettings;
1164     parserSettings.addAttributeNamespaces = 0;
1165     if(LOGICAL_DATA(addNamespaces)[0])
1166 	parserSettings.addAttributeNamespaces |= R_XML_NS_ADD_PREFIX;
1167     if(LOGICAL_DATA(addNamespaceURLs)[0])
1168 	parserSettings.addAttributeNamespaces |= R_XML_NS_ADD_URL_DEFS;
1169 
1170     return(RS_XML(AttributeList)(node, &parserSettings));
1171 }
1172 
1173 USER_OBJECT_
RS_XML_xmlNodeParent(USER_OBJECT_ snode,USER_OBJECT_ manageMemory)1174 RS_XML_xmlNodeParent(USER_OBJECT_ snode, USER_OBJECT_ manageMemory)
1175 {
1176     xmlNodePtr node = (xmlNodePtr) R_ExternalPtrAddr(snode);
1177     if(node->parent && (node->parent->type == XML_DOCUMENT_NODE || node->parent->type == XML_HTML_DOCUMENT_NODE))
1178 	return(NULL_USER_OBJECT);
1179     return(R_createXMLNodeRef(node->parent, manageMemory));
1180 }
1181 
1182 
1183 USER_OBJECT_
RS_XML_xmlNodeNumChildren(USER_OBJECT_ snode)1184 RS_XML_xmlNodeNumChildren(USER_OBJECT_ snode)
1185 {
1186     xmlNodePtr node = (xmlNodePtr) R_ExternalPtrAddr(snode);
1187     int count = 0;
1188     xmlNodePtr ptr =  node->children;
1189 
1190     while(ptr) {
1191 	count++;
1192 	ptr = ptr->next;
1193     }
1194     return(ScalarInteger(count));
1195 }
1196 
1197 
1198 USER_OBJECT_
RS_XML_xmlNodeChildrenReferences(USER_OBJECT_ snode,USER_OBJECT_ r_addNames,USER_OBJECT_ manageMemory)1199 RS_XML_xmlNodeChildrenReferences(USER_OBJECT_ snode, USER_OBJECT_ r_addNames, USER_OBJECT_ manageMemory)
1200 {
1201     xmlNodePtr node = (xmlNodePtr) R_ExternalPtrAddr(snode);
1202     USER_OBJECT_ ans, names = R_NilValue;
1203     int count = 0, i;
1204     xmlNodePtr ptr =  node->children;
1205     int addNames = LOGICAL(r_addNames)[0];
1206     DECL_ENCODING_FROM_NODE(node)
1207     int nprot = 0;
1208 
1209     while(ptr) {
1210 	count++;
1211 	ptr = ptr->next;
1212     }
1213 
1214     ptr = node->children;
1215 
1216     PROTECT(ans = NEW_LIST(count)); nprot++;
1217     if(addNames) {
1218 	PROTECT(names = NEW_CHARACTER(count));
1219 	nprot++;
1220     }
1221 
1222     for(i = 0; i < count ; i++, ptr = ptr->next) {
1223 	SET_VECTOR_ELT(ans, i, R_createXMLNodeRef(ptr, manageMemory));
1224 	if(addNames)
1225 	    SET_STRING_ELT(names, i, ENC_COPY_TO_USER_STRING(ptr->name ?  ptr->name : (const xmlChar *)""));
1226     }
1227     if(addNames)
1228 	SET_NAMES(ans, names);
1229     UNPROTECT(nprot);
1230 
1231     return(ans);
1232 }
1233 
1234 USER_OBJECT_
R_getNodeChildByIndex(USER_OBJECT_ snode,USER_OBJECT_ r_index,USER_OBJECT_ manageMemory)1235 R_getNodeChildByIndex(USER_OBJECT_ snode, USER_OBJECT_ r_index, USER_OBJECT_ manageMemory)
1236 {
1237     xmlNodePtr node = (xmlNodePtr) R_ExternalPtrAddr(snode);
1238     int count = 0, num;
1239     xmlNodePtr ptr = node->children;
1240 
1241     num = INTEGER(r_index)[0] - 1;
1242     if(num < 0) {
1243 	Rf_error("cannot index an internal node with a negative number %d", num);
1244     }
1245 
1246 
1247     while(ptr && count < num) {
1248 	count++;
1249 	ptr = ptr->next;
1250     }
1251 
1252     return(ptr ? R_createXMLNodeRef(ptr, manageMemory) : NULL_USER_OBJECT);
1253 }
1254 
1255 
1256 
1257 static USER_OBJECT_
makeSchemaReference(xmlSchemaPtr schema)1258 makeSchemaReference(xmlSchemaPtr schema)
1259 {
1260     return(R_makeRefObject(schema, "xmlSchemaRef"));
1261 /*
1262     USER_OBJECT_ ans;
1263     PROTECT(ans = R_MakeExternalPtr(schema, Rf_install("XMLSchema"), R_NilValue));
1264     SET_CLASS(ans, mkString("XMLSchema"));
1265     UNPROTECT(1);
1266     return(ans);
1267 */
1268 }
1269 
1270 // unused
1271 #define NO_XML_MEMORY_SHOW_ROUTINE 1
1272 
1273 #ifndef NO_XML_MEMORY_SHOW_ROUTINE
1274 void
RS_XML_MemoryShow()1275 RS_XML_MemoryShow()
1276 {
1277     xmlMemDisplay(stderr);
1278 }
1279 #endif
1280 
1281 USER_OBJECT_
RS_XML_setDocumentName(USER_OBJECT_ sdoc,USER_OBJECT_ sname)1282 RS_XML_setDocumentName(USER_OBJECT_ sdoc, USER_OBJECT_ sname)
1283 {
1284     /* if doc is NULL in C , return NULL in R
1285        If doc->name is NULL in C, return NA
1286        Otherwise,  return the string.
1287        */
1288     xmlDocPtr doc = (xmlDocPtr) R_ExternalPtrAddr(sdoc);
1289 
1290     if(!doc) {
1291 	Rf_warning("NULL pointer supplied for internal document");
1292 	return(R_NilValue);
1293     }
1294 
1295     doc->URL = xmlStrdup(CHAR_TO_XMLCHAR(CHAR_DEREF(STRING_ELT(sname, 0))));
1296 
1297     return(sdoc);
1298 }
1299 
1300 
1301 
1302 USER_OBJECT_
RS_XML_getDocumentName(USER_OBJECT_ sdoc)1303 RS_XML_getDocumentName(USER_OBJECT_ sdoc)
1304 {
1305     /* if doc is NULL in C , return NULL in R
1306        If doc->name is NULL in C, return NA
1307        Otherwise,  return the string.
1308        */
1309     xmlDocPtr doc = (xmlDocPtr) R_ExternalPtrAddr(sdoc);
1310     USER_OBJECT_ ans;
1311     const xmlChar *encoding;
1312     if(!doc) {
1313 	Rf_warning("NULL pointer supplied for internal document");
1314 	return(R_NilValue);
1315     }
1316     encoding = doc->encoding;
1317     PROTECT(ans = NEW_CHARACTER(1));
1318     SET_STRING_ELT(ans, 0, doc->URL ? ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(doc->URL)) : R_NaString);
1319     UNPROTECT(1);
1320     return(ans);
1321 }
1322 
1323 
1324 USER_OBJECT_
RS_XML_setKeepBlanksDefault(USER_OBJECT_ val)1325 RS_XML_setKeepBlanksDefault(USER_OBJECT_ val)
1326 {
1327     int prev;
1328     prev = xmlKeepBlanksDefault(INTEGER(val)[0]);
1329     return(ScalarInteger(prev));
1330 }
1331 
1332 
1333 USER_OBJECT_
RS_XML_xmlXIncludeProcessFlags(USER_OBJECT_ r_doc,USER_OBJECT_ r_flags)1334 RS_XML_xmlXIncludeProcessFlags(USER_OBJECT_ r_doc, USER_OBJECT_ r_flags)
1335 {
1336     xmlDocPtr doc = (xmlDocPtr) R_ExternalPtrAddr(r_doc);
1337     int ans;
1338 
1339     ans = xmlXIncludeProcessFlags(doc, INTEGER(r_flags)[0]);
1340     return(ScalarInteger(ans));
1341 }
1342 
1343 USER_OBJECT_
RS_XML_xmlXIncludeProcessTreeFlags(USER_OBJECT_ r_node,USER_OBJECT_ r_flags)1344 RS_XML_xmlXIncludeProcessTreeFlags(USER_OBJECT_ r_node, USER_OBJECT_ r_flags)
1345 {
1346     xmlNodePtr node;
1347     int flags = INTEGER(r_flags)[0];
1348     int n;
1349     //xmlNodePtr prev, parent;
1350     SEXP ans = R_NilValue;
1351 
1352     node = (xmlNodePtr) R_ExternalPtrAddr(r_node);
1353     //prev = node->prev;
1354     //parent = node->parent;
1355 
1356     n = xmlXIncludeProcessTreeFlags(node, flags);
1357 
1358     if(n == 0)
1359 	return(R_NilValue);
1360     else if(n == -1) {
1361 	Rf_error("failed in XInclude");
1362     }
1363 
1364 #if 0
1365     if(!prev)  {
1366 	fprintf(stderr, "Adding to children of %s\n", prev->name);
1367         prev = parent->children;
1368     } else {
1369 	fprintf(stderr, "Adding after  %s\n", prev->name);
1370 	prev = prev->next;
1371     }
1372 
1373     prev = node->next;
1374 
1375     PROTECT(ans = NEW_LIST(n));
1376     for(i = 0; i < n; i++) {
1377 	SET_VECTOR_ELT(ans, i, prev ? R_createXMLNodeRef(prev) : R_NilValue);
1378 	prev = prev->next;
1379     }
1380     UNPROTECT(1);
1381 #endif
1382 
1383     return(ans);
1384 }
1385 
1386 
1387 
1388 /**
1389    Create an R named list containing the attributes of the specified node.
1390  */
1391 
1392 /*
1393    We could use the CONS mechanism rather than doing a double pass.
1394    Not certain what is quicker in this situation. Also, doesn't
1395    work that way in S4, so keep it this way.
1396 */
1397 USER_OBJECT_
RS_XML(AttributeList)1398 RS_XML(AttributeList)(xmlNodePtr node, R_XMLSettings *parserSettings)
1399 {
1400   USER_OBJECT_ ans = NULL_USER_OBJECT;
1401   USER_OBJECT_ ans_names;
1402   xmlAttr * atts;
1403   const xmlChar *encoding = node->doc ? node->doc->encoding : NULL;
1404 
1405   int n = 0, i;
1406 
1407       /* Count the number of attributes*/
1408     atts = node->properties;
1409 
1410     while(atts) {
1411       n++;
1412       atts = atts->next;
1413     }
1414 
1415   if(n > 0) {
1416     SEXP ans_namespaces, ans_namespaceDefs;
1417     int nonTrivialAttrNamespaces = 0;
1418     int addNSPrefix = parserSettings->addAttributeNamespaces & R_XML_NS_ADD_PREFIX;
1419     int retNSDefs = parserSettings->addAttributeNamespaces & R_XML_NS_ADD_URL_DEFS;
1420 
1421     PROTECT(ans = NEW_CHARACTER(n));
1422     PROTECT(ans_names = NEW_CHARACTER(n));
1423     PROTECT(ans_namespaces = NEW_CHARACTER(n));
1424     PROTECT(ans_namespaceDefs = NEW_CHARACTER(retNSDefs ? n : 0));
1425 
1426          /* Loop over the attributes and create the string elements
1427             and the elements of the name vector.
1428           */
1429 
1430       atts = node->properties;
1431 
1432       for(i=0; i < n ; i++) {
1433 	/* Have to be careful that atts->val and atts->val->context are non-null. Something like
1434            <a href=""> kills it otherwise.
1435          */
1436 #ifdef LIBXML2
1437          SET_STRING_ELT(ans, i,
1438                          ENC_COPY_TO_USER_STRING(
1439                                        XMLCHAR_TO_CHAR(
1440                                           ((atts->xmlChildrenNode != (xmlNode*)NULL && atts->xmlChildrenNode->content != (xmlChar*)NULL )
1441                                                        ? atts->xmlChildrenNode->content : (xmlChar*)""))));
1442 #else
1443          SET_STRING_ELT(ans, i, ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(((atts->val != (xmlNode*)NULL && atts->val->content != (xmlChar*)NULL )
1444                                                             ? atts->val->content : (xmlChar*)""))));
1445 
1446 #endif
1447          if(atts->name) {
1448            if(addNSPrefix && atts->ns && atts->ns->prefix) {
1449              char buf[400];
1450              sprintf(buf, "%s:%s", atts->ns->prefix, atts->name);
1451              SET_STRING_ELT(ans_names, i, ENC_COPY_TO_USER_STRING(buf));
1452 	   } else
1453              SET_STRING_ELT(ans_names, i, ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(atts->name)));
1454 
1455 	   if((addNSPrefix | retNSDefs) && atts->ns && atts->ns->prefix) {
1456 	     SET_STRING_ELT(ans_namespaces, i, ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(atts->ns->prefix)));
1457 
1458 	     if(retNSDefs)
1459     	         SET_STRING_ELT(ans_namespaceDefs, i, ENC_COPY_TO_USER_STRING(XMLCHAR_TO_CHAR(atts->ns->href)));
1460 
1461 	     nonTrivialAttrNamespaces++;
1462 	   }
1463 	 }
1464 
1465          atts = atts->next;
1466      }
1467 
1468    if(nonTrivialAttrNamespaces) {
1469        if(retNSDefs)
1470           Rf_setAttrib(ans_namespaces, Rf_install("names"), ans_namespaceDefs);
1471        Rf_setAttrib(ans, Rf_install("namespaces"), ans_namespaces);
1472    }
1473     SET_NAMES(ans, ans_names);
1474 
1475     UNPROTECT(4);
1476    }
1477 #if 0
1478    else
1479       ans = NEW_CHARACTER(0);
1480 #endif
1481 
1482   return(ans);
1483 }
1484 
1485 
1486 
1487 SEXP
R_getDocEncoding(SEXP r_doc)1488 R_getDocEncoding(SEXP r_doc)
1489 {
1490     xmlDocPtr doc = (xmlDocPtr) R_ExternalPtrAddr(r_doc);
1491 
1492     const xmlChar *encoding;
1493     SEXP ans;
1494 
1495     if(doc->type != XML_DOCUMENT_NODE && doc->type != XML_HTML_DOCUMENT_NODE)
1496 	doc = ((xmlNodePtr) doc)->doc;
1497     if(!doc)
1498 	return(NEW_CHARACTER(0));
1499 
1500     encoding = doc->encoding;
1501     PROTECT(ans = NEW_CHARACTER(1));
1502     SET_STRING_ELT(ans, 0, encoding ? CreateCharSexpWithEncoding(doc->encoding, doc->encoding) : R_NaString);
1503     UNPROTECT(1);
1504 
1505     return(ans);
1506 }
1507 
1508 
1509 int
getTextElementLineNumber(xmlNodePtr node)1510 getTextElementLineNumber(xmlNodePtr node)
1511 {
1512     int val = -1;
1513 
1514     if(node->parent)
1515 	val = node->parent->line;
1516 
1517     xmlNodePtr prev = node->prev;
1518     while(prev) {
1519 	if(prev->line > 0) {
1520 	    val = prev->line;
1521 	    break;
1522 	}
1523 	prev = prev->prev;
1524     }
1525     return(val);
1526 }
1527 
1528 SEXP
R_getLineNumber(SEXP r_node)1529 R_getLineNumber(SEXP r_node)
1530 {
1531     xmlNodePtr node = (xmlNodePtr) R_ExternalPtrAddr(r_node);
1532 
1533     if(!node) {
1534 	return(NEW_INTEGER(0));
1535     }
1536 
1537 //    XML_GET_LINE(node)
1538     return(ScalarInteger(node->line == 0 ?
1539 			 getTextElementLineNumber(node) : node->line));
1540 }
1541 
1542 
1543 SEXP
R_xmlReadFile(SEXP r_filename,SEXP r_encoding,SEXP r_options)1544 R_xmlReadFile(SEXP r_filename, SEXP r_encoding, SEXP r_options) //, SEXP manageMemory)
1545 {
1546     const char *filename;
1547     const char *encoding = NULL;
1548     int options;
1549     xmlDocPtr doc;
1550 
1551     filename = CHAR_DEREF(STRING_ELT(r_filename, 0));
1552     if(Rf_length(r_encoding))
1553        encoding = CHAR_DEREF(STRING_ELT(r_encoding, 0));
1554     options = INTEGER(r_options)[0];
1555 
1556     doc = xmlReadFile(filename, encoding, options);
1557     return(R_createXMLDocRef(doc));
1558 }
1559 
1560 SEXP
R_xmlReadMemory(SEXP r_txt,SEXP len,SEXP r_encoding,SEXP r_options,SEXP r_base)1561 R_xmlReadMemory(SEXP r_txt, SEXP len, SEXP r_encoding, SEXP r_options, SEXP r_base) //, SEXP manageMemory)
1562 {
1563     const char *txt;
1564     const char *encoding = NULL;
1565     const char *baseURL = NULL;
1566     int options;
1567     xmlDocPtr doc;
1568 
1569     txt = CHAR_DEREF(STRING_ELT(r_txt, 0));
1570     if(Rf_length(r_encoding))
1571        encoding = CHAR_DEREF(STRING_ELT(r_encoding, 0));
1572     options = INTEGER(r_options)[0];
1573 
1574     if(Rf_length(r_base))
1575 	baseURL = CHAR_DEREF(STRING_ELT(r_base, 0));
1576 
1577     doc = xmlReadMemory(txt, INTEGER(len)[0], baseURL, encoding, options);
1578     return(R_createXMLDocRef(doc));
1579 }
1580 
1581 
1582 
1583 #if 1
1584 
1585 int
addXInclude(xmlNodePtr ptr,SEXP * ans,int level,SEXP manageMemory)1586 addXInclude(xmlNodePtr ptr, SEXP *ans, int level, SEXP manageMemory)
1587 {
1588 	if(ptr->type == XML_XINCLUDE_START) {
1589             int len = Rf_length(*ans) + 1;
1590 	    SEXP oans = *ans; // avoid sequence-point error
1591             PROTECT(*ans = SET_LENGTH(oans, len));
1592 	    SET_VECTOR_ELT(*ans, len - 1, R_createXMLNodeRef(ptr, manageMemory));
1593 	    UNPROTECT(1);
1594 	    return(1);
1595 	} else
1596 	    return(0);
1597 
1598 }
1599 
1600 int
processKids(xmlNodePtr ptr,SEXP * ans,int level,SEXP manageMemory)1601 processKids(xmlNodePtr ptr, SEXP *ans, int  level, SEXP manageMemory)
1602 {
1603         xmlNodePtr kids;
1604 	int count = 0;
1605 	kids = ptr->children;
1606         while(kids) {
1607 	    count += addXInclude(kids, ans, level, manageMemory);
1608             count += processKids(kids, ans, level + 1, manageMemory);
1609             kids = kids->next;
1610       	}
1611 	return(count);
1612 }
1613 
1614 #if 0
1615 int
1616 findXIncludeStartNodes(xmlNodePtr node, SEXP *ans, int level)
1617 {
1618     const char * prefix[] = {"", "     ", "           ", "                " };
1619     xmlNodePtr ptr = node;
1620     int count = 0;
1621 
1622     addXInclude(node, ans, level);
1623     ptr = node;
1624     while(ptr) {
1625 	count += addXInclude(ptr, ans, level);
1626 	count += processKids(ptr, ans, level);
1627 	ptr = ptr->next;
1628     }
1629 
1630 //fprintf(stderr, "%s level = %d, %s: %p, type = %d\n", prefix[level], level, ptr->name, node, ptr->type);
1631 
1632 //fprintf(stderr, "%p, %s, level = %d, type = %d\n", ptr, ptr->name, level, ptr->type);
1633 
1634     return(count);
1635 }
1636 #endif
1637 
1638 /*
1639  This is a recursive version. We want an iterative version.
1640  */
1641 SEXP
R_findXIncludeStartNodes(SEXP r_root,SEXP manageMemory)1642 R_findXIncludeStartNodes(SEXP r_root, SEXP manageMemory)
1643 {
1644     xmlNodePtr root;
1645     SEXP ans;
1646 
1647     root = (xmlNodePtr) R_ExternalPtrAddr(r_root);
1648     if(!root)
1649 	return(R_NilValue);
1650 
1651     PROTECT(ans = allocVector(VECSXP, 0));
1652     addXInclude(root, &ans, 0, manageMemory);
1653     processKids(root, &ans, 0, manageMemory);
1654     UNPROTECT(1);
1655     return(ans);
1656 }
1657 
1658 #endif
1659