1 #include <gtest/gtest.h>
2 #include <dmlc/filesystem.h>
3 #include <fstream>
4 #include <memory>
5 #include <vector>
6
7 #include "xgboost/data.h"
8 #include "../helpers.h"
9
10 namespace xgboost {
// Exercises SparsePage::PushCSC twice with the same source page:
//  1. into a page holding only the offset {0} and no entries, after which the
//     destination's offsets and entry indices must equal the source's;
//  2. into the now-populated page, after which the offsets must be doubled,
//     the entry count must be doubled, and the view must expose two entries
//     with index 0 followed by six entries whose indices cycle 1, 2, 3.
TEST(SparsePage, PushCSC) {
  // Destination page, explicitly reset to a single zero offset and no data.
  SparsePage batch;
  batch.offset.HostVector() = std::vector<bst_row_t>{0};
  batch.data.HostVector() = std::vector<Entry>{};

  // Source page: offsets {0, 1, 4}, entries indexed 0..3 with value 0.1f.
  const std::vector<bst_row_t> src_offset{0, 1, 4};
  std::vector<Entry> src_data;
  for (size_t idx = 0; idx < src_offset.back(); ++idx) {
    src_data.push_back(Entry(idx, 0.1f));
  }

  SparsePage other;
  other.offset.HostVector() = src_offset;
  other.data.HostVector() = src_data;

  // First push: the destination must now mirror the source page.
  batch.PushCSC(other);
  {
    const auto &got_offset = batch.offset.HostVector();
    const auto &got_data = batch.data.HostVector();
    ASSERT_EQ(got_offset.size(), src_offset.size());
    ASSERT_EQ(got_data.size(), src_data.size());
    for (size_t k = 0; k < src_offset.size(); ++k) {
      ASSERT_EQ(got_offset[k], src_offset[k]);
    }
    for (size_t k = 0; k < src_data.size(); ++k) {
      ASSERT_EQ(got_data[k].index, src_data[k].index);
    }
  }

  // Second push: same number of offsets, twice the entries, doubled offsets.
  batch.PushCSC(other);
  ASSERT_EQ(batch.offset.HostVector().size(), src_offset.size());
  ASSERT_EQ(batch.data.Size(), src_data.size() * 2);
  {
    const auto &got_offset = batch.offset.HostVector();
    for (size_t k = 0; k < src_offset.size(); ++k) {
      ASSERT_EQ(got_offset[k], src_offset[k] * 2);
    }
  }

  auto page = batch.GetView();

  // First segment: the index-0 entry from each of the two pushes.
  auto head = page[0];
  ASSERT_EQ(head.size(), 2ul);
  for (auto const &entry : head) {
    ASSERT_EQ(entry.index, 0u);
  }

  // Second segment: indices 1, 2, 3 from the first push, then again from the
  // second push.
  auto tail = page[1];
  ASSERT_EQ(tail.size(), 6ul);
  const std::vector<size_t> indices_sol{1, 2, 3};
  for (size_t k = 0; k < tail.size(); ++k) {
    ASSERT_EQ(tail[k].index, indices_sol[k % 3]);
  }
}
60
// Transposes every batch of a sparse-page DMatrix, accumulates the transposed
// batches into one page with PushCSC, and then checks that
//  - the accumulated page holds exactly kEntries entries, and
//  - after SortRows(), the fvalues inside each segment of the view are in
//    non-decreasing order.
TEST(SparsePage, PushCSCAfterTranspose) {
  constexpr size_t kPageSize = 1024;
  constexpr size_t kEntriesPerCol = 3;
  constexpr size_t kEntries = kPageSize * kEntriesPerCol * 2;

  std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries);
  const int ncols = dmat->Info().num_col_;

  SparsePage merged;  // Consolidated sparse page
  for (const auto &batch : dmat->GetBatches<xgboost::SparsePage>()) {
    // Transpose each batch and push it straight into the consolidated page.
    merged.PushCSC(batch.GetTranspose(ncols));
  }

  // Make sure that the final sparse page has the right number of entries
  ASSERT_EQ(kEntries, merged.data.Size());

  merged.SortRows();
  auto view = merged.GetView();
  for (size_t col = 0; col < view.Size(); ++col) {
    auto column = view[col];
    // Compare each entry against its predecessor within the same segment.
    for (size_t k = 1; k < column.size(); ++k) {
      const auto &prev = column[k - 1];
      const auto &curr = column[k];
      ASSERT_GE(curr.fvalue, prev.fvalue);
    }
  }
}
85
// Writes a kRows x kCols CSV file (cells numbered 0, 1, 2, ... in row-major
// order) into a temporary directory, loads it through DMatrix::Load using a
// URI that carries the "?format=csv" suffix, and checks the reported shape.
TEST(DMatrix, Uri) {
  constexpr size_t kRows{16};
  constexpr size_t kCols{8};

  dmlc::TemporaryDirectory tmpdir;
  const std::string path = tmpdir.path + "/small.csv";

  std::ofstream fout(path);
  for (size_t r = 0; r < kRows; ++r) {
    for (size_t c = 0; c < kCols; ++c) {
      // Separator goes between cells only, never after the last one.
      if (c != 0) {
        fout << ",";
      }
      // Cells are streamed as float, matching the row-major running counter.
      fout << static_cast<float>(r * kCols + c);
    }
    fout << "\n";
  }
  fout.flush();
  fout.close();

  std::unique_ptr<DMatrix> dmat;
  // FIXME(trivialfis): Enable the following test by restricting csv parser in dmlc-core.
  // EXPECT_THROW(dmat.reset(DMatrix::Load(path, false, true)), dmlc::Error);

  const std::string uri = path + "?format=csv";
  dmat.reset(DMatrix::Load(uri, false, true));

  ASSERT_EQ(dmat->Info().num_col_, kCols);
  ASSERT_EQ(dmat->Info().num_row_, kRows);
}
123 } // namespace xgboost
124