\name{getHTMLLinks}
\alias{getHTMLLinks}
\alias{getHTMLExternalFiles}
\title{Get links or names of external files in HTML document}
\description{
 These functions allow us to retrieve either the links
 within an HTML document, or the collection of names of
 external files referenced in an HTML document.
 The external files include images, JavaScript and CSS documents.
}
\usage{
getHTMLLinks(doc, externalOnly = TRUE, xpQuery = "//a/@href",
               baseURL = docName(doc), relative = FALSE)
getHTMLExternalFiles(doc, xpQuery = c("//img/@src", "//link/@href",
                                      "//script/@href", "//embed/@src"),
                     baseURL = docName(doc), relative = FALSE,
                     asNodes = FALSE, recursive = FALSE)
}
%- maybe also 'usage' for other objects documented here.
\arguments{
  \item{doc}{the HTML document as a URL, local file name, parsed
    document or an XML/HTML node}
  \item{externalOnly}{a logical value that indicates whether we should
    only return links to external documents and not references to
    internal anchors/nodes within this document, i.e. those of the
    form \code{#foo}.}
  \item{xpQuery}{a character vector of XPath expressions that match the
    elements of interest}
  \item{baseURL}{the URL of the container document. This is used
    to resolve relative references/links.
  }
  \item{relative}{a logical value indicating whether to leave the
    references as relative to the base URL or to expand them to their full paths.
  }
  \item{asNodes}{a logical value that indicates whether we want the actual
    HTML/XML nodes in the document that reference external documents
    or just the names of the external documents.}
  \item{recursive}{a logical value that controls whether we recursively
    process the external documents we find in the top-level document
    examining them for their external files.}
}

\value{
 \code{getHTMLLinks} returns a character vector of the links.

 \code{getHTMLExternalFiles} returns a character vector of the names of
 the external files or, if \code{asNodes} is \code{TRUE}, the HTML/XML
 nodes that reference them.
}
\author{
Duncan Temple Lang
}
\seealso{
 \code{\link{getXIncludes}}
}
\examples{\donttest{ # site is flaky
  try(getHTMLLinks("http://www.omegahat.net"))

  try(getHTMLLinks("http://www.omegahat.net/RSXML"))

  try(unique(getHTMLExternalFiles("http://www.omegahat.net")))
}}
\keyword{IO}
\keyword{programming}