1context("read_xml")
2
3test_that("read_xml errors with an empty document", {
4  expect_error(read_xml(character()), "Document is empty")
5
6  tf <- tempfile()
7  file.create(tf)
8  on.exit(unlink(tf))
9
10  expect_error(read_xml(tf), "Document is empty")
11})
12
13test_that("read_html correctly parses malformed document", {
14  lego <- read_html("lego.html.bz2")
15  expect_equal(length(xml_find_all(lego, ".//p")), 39)
16})
17
18test_that("parse_options errors when given an invalid option", {
19  expect_error(parse_options("INVALID", xml_parse_options()),
20    "`options` 'INVALID' is not a valid option")
21
22  expect_error(read_html("lego.html.bz2", options = "INVALID"),
23    "`options` 'INVALID' is not a valid option")
24
25  # Empty inputs returned as 0
26  expect_identical(0L, parse_options("", xml_parse_options()))
27  expect_identical(0L, parse_options(NULL, xml_parse_options()))
28
29  # Numerics returned as integers
30  expect_identical(12L, parse_options(12L, xml_parse_options()))
31  expect_identical(12L, parse_options(12, xml_parse_options()))
32
33  # Multiple inputs summed
34  expect_identical(3L, parse_options(c("RECOVER", "NOENT"), xml_parse_options()))
35})
36
37test_that("read_html properly passes parser arguments", {
38
39  skip_if_not(libxml2_version() >= "2.9.2")
40
41  blanks <- read_html(xml2_example("cd_catalog.xml"), options = c("RECOVER", "NOERROR"))
42  expect_equal(as_list(blanks)$html$body$catalog$cd[[1]],
43    "\r\n    ")
44
45  no_blanks <- read_html(xml2_example("cd_catalog.xml"), options = c("RECOVER", "NOERROR", "NOBLANKS"))
46
47  expect_equal(as_list(no_blanks)$html$body$catalog$cd[[1]],
48    list("Empire Burlesque"))
49})
50
51test_that("read_xml works with httr response objects", {
52  skip_on_cran()
53  skip_if_offline()
54  skip_if_not_installed("httr")
55
56  x <- read_xml(httr::GET("http://httpbin.org/xml"))
57  expect_is(x, "xml_document")
58
59  expect_equal(length(xml_find_all(x, "//slide")), 2)
60})
61
62test_that("read_html works with httr response objects", {
63  skip_on_cran()
64  skip_if_offline()
65
66  x <- read_html(httr::GET("http://httpbin.org/xml"))
67  expect_is(x, "xml_document")
68
69  expect_equal(length(xml_find_all(x, "//slide")), 2)
70})
71
72test_that("read_xml works with raw inputs", {
73  x <- read_xml("<foo/>")
74  expect_equal(xml_url(x), NA_character_)
75})
76
77test_that("read_xml and read_html fail for bad status codes", {
78
79  skip_on_cran()
80  skip_if_not_installed("httr")
81  skip_if_offline()
82
83  expect_error(
84    read_xml(httr::GET("http://httpbin.org/status/404")),
85    class = "http_404"
86  )
87
88  expect_error(
89    read_html(httr::GET("http://httpbin.org/status/404")),
90    class = "http_404"
91  )
92})
93
94test_that("read_html works with non-ASCII encodings", {
95  tmp <- tempfile()
96  on.exit(unlink(tmp))
97
98  writeLines("<html><body>\U2019</body></html>", tmp, useBytes = TRUE)
99  res <- read_html(tmp, encoding = "UTF-8")
100
101  expect_equal(as.character(res, options = ""),
102    "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">\n<html><body>\U2019</body></html>\n")
103})
104
105test_that("read_xml and read_html fail with > 1 input", {
106  expect_error(read_xml(c("foo", "bar")), "`x` must be a string of length 1")
107  expect_error(read_html(c("foo", "bar")), "`x` must be a string of length 1")
108})
109