1isURL = 2function(file) 3{ 4 is.character(file) && grepl("^(http|ftp)", file) 5} 6 7 8############ 9#XXXXXXXXX 10# This is now replaced by copying xmlTreeParse. 11htmlTreeParse <- 12# 13# HTML parser that reads the entire `document' tree into memory 14# and then converts it to an R/S object. 15# Uses the libxml from Daniel Veillard at W3.org. 16# 17# asText treat the value of file as XML text, not the name of a file containing 18# the XML text, and parse that. 19# See also xml 20# 21function(file, ignoreBlanks = TRUE, handlers = NULL, 22 replaceEntities = FALSE, asText = inherits(file, "AsIs") || !isURL && grepl("^<", file), # could have a BOM 23 trim = TRUE, 24 isURL = is.character(file) && grepl("^(http|ftp)", file), 25 asTree = FALSE, useInternalNodes = FALSE, 26 encoding = character(), 27 useDotNames = length(grep("^\\.", names(handlers))) > 0, 28 xinclude = FALSE, addFinalizer = TRUE, error = function(...){}, 29 options = integer(), parentFirst = FALSE) 30{ 31if(TRUE) 32 { 33 doc = xmlTreeParse(file, ignoreBlanks, handlers, replaceEntities, asText, trim, validate = FALSE, 34 getDTD = FALSE, isURL, asTree, addAttributeNamespaces = FALSE, 35 useInternalNodes, isSchema = FALSE, fullNamespaceInfo = FALSE, 36 encoding, useDotNames, xinclude, addFinalizer, error, isHTML = TRUE, options = options) 37 class(doc) = c("HTMLInternalDocument", class(doc)[1]) 38 return(doc) 39 } 40 41 42 if(length(file) > 1) { 43 file = paste(file, collapse = "\n") 44 if(!missing(asText) && !asText) 45 stop("multiple URIs passed to xmlTreeParse. If this is the content of the file, specify asText = TRUE") 46 asText = TRUE 47 } 48 49 50 if(missing(asText) && substring(file, 1, 1) == "<") 51 asText = TRUE 52 53 if(!asText && missing(isURL)) { 54 isURL <- length(grep("^(http|ftp)://", file, useBytes = TRUE, perl = TRUE)) 55 } 56 57 # check whether we are treating the file name as 58 # a) the XML text itself, or b) as a URL. 59 # Otherwise, check if the file exists and report an error. 60 if(asText == FALSE && isURL == FALSE) { 61 if(file.exists(file) == FALSE) 62 stop(paste("File", file, "does not exist ")) 63 } 64 65 if(!asText && !isURL) 66 file = path.expand(file) 67 68 old = setEntitySubstitution(replaceEntities) 69 on.exit(setEntitySubstitution(old)) 70 71 if(!is.logical(xinclude)) { 72 if(inherits(xinclude, "numeric")) 73 xinclude = bitlist(xinclude) 74 else 75 xinclude = as.logical(xinclude) 76 } 77 78 .oldErrorHandler = setXMLErrorHandler(error) 79 on.exit(.Call("RS_XML_setStructuredErrorHandler", .oldErrorHandler, PACKAGE = "XML"), add = TRUE) 80 81 ans <- .Call("RS_XML_ParseTree", as.character(file), handlers, 82 as.logical(ignoreBlanks), as.logical(replaceEntities), 83 as.logical(asText), as.logical(trim), 84 FALSE, FALSE, 85 as.logical(isURL), FALSE, 86 as.logical(useInternalNodes), TRUE, FALSE, FALSE, as.character(encoding), 87 as.logical(useDotNames), xinclude, error, addFinalizer, options, as.logical(parentFirst), PACKAGE = "XML") 88 89 if(!missing(handlers) & !as.logical(asTree)) 90 return(handlers) 91 92 if(inherits(ans, "XMLInternalDocument")) { 93 addDocFinalizer(ans, addFinalizer) 94 class(ans) = c("HTMLInternalDocument", class(ans)) 95 } 96 97 ans 98} 99 100#XXXXXX 101# This is another version that doesn't seem to release the document. Weird. I can't seem to find 102# out who is holding onto it. 103myHTMLParse = 104function(file, ignoreBlanks = TRUE, handlers = NULL, 105 replaceEntities = FALSE, asText = inherits(file, "AsIs") || !isURL && grepl("^<", file), # could have a BOM 106 trim = TRUE, 107 isURL = is.character(file) && grepl("^(http|ftp)", file), 108 asTree = FALSE, useInternalNodes = FALSE, 109 encoding = character(), 110 useDotNames = length(grep("^\\.", names(handlers))) > 0, 111 xinclude = FALSE, addFinalizer = TRUE, error = function(...){}) 112{ 113 doc = xmlTreeParse(file, ignoreBlanks, handlers, replaceEntities, asText, trim, validate = FALSE, 114 getDTD = FALSE, isURL, asTree, addAttributeNamespaces = FALSE, 115 useInternalNodes, isSchema = FALSE, fullNamespaceInfo = FALSE, 116 encoding, useDotNames, xinclude, addFinalizer, error, isHTML = TRUE) 117 class(doc) = c("HTMLInternalDocument", class(doc)[2]) 118 return(doc) 119} 120 121 122hideParseErrors = function (...) NULL 123 124 125htmlTreeParse = xmlTreeParse 126 127 128formals(htmlTreeParse)$error = as.name("htmlErrorHandler") # as.name("hideParseErrors") 129formals(htmlTreeParse)$isHTML = TRUE 130 131htmlParse = htmlTreeParse 132formals(htmlParse)$useInternalNodes = TRUE 133 134 135 136parseURI = 137function(uri) 138{ 139 if(is.na(uri)) 140 return(structure(as.character(uri), class = "URI")) 141 142 u = .Call("R_parseURI", as.character(uri), PACKAGE = "XML") 143 if(u$port == 0) 144 u$port = as.integer(NA) 145 146 class(u) = "URI" 147 148 u 149} 150 151setOldClass("URI") 152setOldClass("URL") 153 154setAs("URI", "character", 155 function(from) { 156 if(from$scheme == "") 157 sprintf("%s%s%s", 158 from["path"], 159 if(from[["query"]] != "") sprintf("?%s", from[["query"]]) else "", 160 if(from[["fragment"]] != "") sprintf("#%s", from[["fragment"]]) else "" ) 161 else 162 sprintf("%s://%s%s%s%s%s%s%s", 163 from[["scheme"]], 164 from[["user"]], 165 if(from[["user"]] != "") "@" else "", 166 from[["server"]], 167 if(!is.na(from[["port"]])) sprintf(":%d", as.integer(from[["port"]])) else "", 168 from["path"], 169 if(from[["query"]] != "") sprintf("?%s", from[["query"]]) else "", 170 if(from[["fragment"]] != "") sprintf("#%s", from[["fragment"]]) else "" 171 ) 172 }) 173 174 175 176