1isURL =
2function(file)
3{
4  is.character(file) && grepl("^(http|ftp)", file)
5}
6
7
8############
9#XXXXXXXXX
10# This is now replaced by copying xmlTreeParse.
11htmlTreeParse <-
12#
13# HTML parser that reads the entire `document' tree into memory
14# and then converts it to an R/S object.
15# Uses the libxml from Daniel Veillard at W3.org.
16#
17# asText  treat the value of file as XML text, not the name of a file containing
18#       the XML text, and parse that.
19# See also xml
20#
21function(file, ignoreBlanks = TRUE, handlers = NULL,
22           replaceEntities = FALSE, asText = inherits(file, "AsIs") || !isURL && grepl("^<", file), # could have a BOM
23            trim = TRUE,
24            isURL = is.character(file) && grepl("^(http|ftp)", file),
25            asTree = FALSE, useInternalNodes = FALSE,
26            encoding = character(),
27            useDotNames = length(grep("^\\.", names(handlers))) > 0,
28            xinclude = FALSE, addFinalizer = TRUE, error = function(...){},
29            options = integer(), parentFirst = FALSE)
30{
31if(TRUE)
32  {
33     doc = xmlTreeParse(file, ignoreBlanks, handlers, replaceEntities, asText, trim, validate = FALSE,
34                      getDTD = FALSE, isURL, asTree, addAttributeNamespaces = FALSE,
35                       useInternalNodes, isSchema = FALSE, fullNamespaceInfo = FALSE,
36                        encoding, useDotNames, xinclude, addFinalizer, error, isHTML = TRUE, options = options)
37     class(doc) = c("HTMLInternalDocument", class(doc)[1])
38     return(doc)
39  }
40
41
42  if(length(file) > 1) {
43   file = paste(file, collapse = "\n")
44    if(!missing(asText) && !asText)
45      stop("multiple URIs passed to xmlTreeParse. If this is the content of the file,  specify asText = TRUE")
46   asText = TRUE
47 }
48
49
50  if(missing(asText) && substring(file, 1, 1) == "<")
51    asText = TRUE
52
53  if(!asText && missing(isURL)) {
54     isURL <- length(grep("^(http|ftp)://", file, useBytes = TRUE, perl = TRUE))
55  }
56
57    # check whether we are treating the file name as
58    # a) the XML text itself, or b) as a URL.
59    # Otherwise, check if the file exists and report an error.
60 if(asText == FALSE && isURL == FALSE) {
61  if(file.exists(file) == FALSE)
62     stop(paste("File", file, "does not exist "))
63 }
64
65 if(!asText && !isURL)
66   file = path.expand(file)
67
68 old = setEntitySubstitution(replaceEntities)
69 on.exit(setEntitySubstitution(old))
70
71 if(!is.logical(xinclude)) {
72   if(inherits(xinclude, "numeric"))
73    xinclude = bitlist(xinclude)
74   else
75     xinclude = as.logical(xinclude)
76 }
77
78 .oldErrorHandler = setXMLErrorHandler(error)
79 on.exit(.Call("RS_XML_setStructuredErrorHandler", .oldErrorHandler, PACKAGE = "XML"), add = TRUE)
80
81 ans <- .Call("RS_XML_ParseTree", as.character(file), handlers,
82         as.logical(ignoreBlanks), as.logical(replaceEntities),
83          as.logical(asText), as.logical(trim),
84           FALSE, FALSE,
85           as.logical(isURL), FALSE,
86           as.logical(useInternalNodes), TRUE, FALSE, FALSE, as.character(encoding),
87           as.logical(useDotNames), xinclude, error, addFinalizer, options, as.logical(parentFirst), PACKAGE = "XML")
88
89 if(!missing(handlers) & !as.logical(asTree))
90   return(handlers)
91
92  if(inherits(ans, "XMLInternalDocument")) {
93    addDocFinalizer(ans, addFinalizer)
94    class(ans) = c("HTMLInternalDocument", class(ans))
95  }
96
97 ans
98}
99
100#XXXXXX
101# This is another version that doesn't seem to release the document. Weird. I can't seem to find
102# out who is holding onto it.
103myHTMLParse =
104function(file, ignoreBlanks = TRUE, handlers = NULL,
105           replaceEntities = FALSE, asText = inherits(file, "AsIs") || !isURL && grepl("^<", file), # could have a BOM
106            trim = TRUE,
107            isURL = is.character(file) && grepl("^(http|ftp)", file),
108            asTree = FALSE, useInternalNodes = FALSE,
109            encoding = character(),
110            useDotNames = length(grep("^\\.", names(handlers))) > 0,
111            xinclude = FALSE, addFinalizer = TRUE, error = function(...){})
112{
113     doc = xmlTreeParse(file, ignoreBlanks, handlers, replaceEntities, asText, trim, validate = FALSE,
114                         getDTD = FALSE, isURL, asTree, addAttributeNamespaces = FALSE,
115                           useInternalNodes, isSchema = FALSE, fullNamespaceInfo = FALSE,
116                            encoding, useDotNames, xinclude, addFinalizer, error, isHTML = TRUE)
117     class(doc) = c("HTMLInternalDocument", class(doc)[2])
118     return(doc)
119}
120
121
122hideParseErrors = function (...) NULL
123
124
125htmlTreeParse = xmlTreeParse
126
127
128formals(htmlTreeParse)$error = as.name("htmlErrorHandler") # as.name("hideParseErrors")
129formals(htmlTreeParse)$isHTML = TRUE
130
131htmlParse = htmlTreeParse
132formals(htmlParse)$useInternalNodes = TRUE
133
134
135
136parseURI =
137function(uri)
138{
139  if(is.na(uri))
140    return(structure(as.character(uri), class = "URI"))
141
142  u = .Call("R_parseURI", as.character(uri), PACKAGE = "XML")
143  if(u$port == 0)
144    u$port = as.integer(NA)
145
146  class(u) = "URI"
147
148  u
149}
150
151setOldClass("URI")
152setOldClass("URL")
153
154setAs("URI", "character",
155      function(from) {
156          if(from$scheme == "")
157              sprintf("%s%s%s",
158                      from["path"],
159                      if(from[["query"]] != "") sprintf("?%s", from[["query"]]) else "",
160                      if(from[["fragment"]] != "") sprintf("#%s", from[["fragment"]]) else "" )
161          else
162           sprintf("%s://%s%s%s%s%s%s%s",
163                                    from[["scheme"]],
164                                    from[["user"]],
165                                    if(from[["user"]] != "") "@" else "",
166                                    from[["server"]],
167                                    if(!is.na(from[["port"]])) sprintf(":%d", as.integer(from[["port"]])) else "",
168                                    from["path"],
169                                    if(from[["query"]] != "") sprintf("?%s", from[["query"]]) else "",
170                                    if(from[["fragment"]] != "") sprintf("#%s", from[["fragment"]]) else ""
171                   )
172      })
173
174
175
176