1 #include <gtest/gtest.h>
2 #include <dmlc/filesystem.h>
3 #include <fstream>
4 #include <memory>
5 #include <vector>
6 
7 #include "xgboost/data.h"
8 #include "../helpers.h"
9 
10 namespace xgboost {
// Exercises SparsePage::PushCSC twice with the same source page: once into an
// empty destination and once more into the already populated destination,
// checking offsets, sizes and entry indices after each push.
TEST(SparsePage, PushCSC) {
  // Destination page: a single zero offset and no entries.
  SparsePage batch;
  batch.offset.HostVector() = std::vector<bst_row_t>{0};
  batch.data.HostVector() = std::vector<Entry>{};

  // Source page: offsets {0, 1, 4}, i.e. four entries whose indices are 0..3
  // and whose values are all 0.1f.
  std::vector<bst_row_t> const expected_offset{0, 1, 4};
  std::vector<Entry> expected_data;
  for (size_t idx = 0; idx < expected_offset.back(); ++idx) {
    expected_data.push_back(Entry(idx, 0.1f));
  }

  SparsePage other;
  other.offset.HostVector() = expected_offset;
  other.data.HostVector() = expected_data;

  // First push: the destination must now match the source exactly.
  batch.PushCSC(other);
  {
    auto const &h_offset = batch.offset.HostVector();
    auto const &h_data = batch.data.HostVector();
    ASSERT_EQ(h_offset.size(), expected_offset.size());
    ASSERT_EQ(h_data.size(), expected_data.size());
    for (size_t k = 0; k < expected_offset.size(); ++k) {
      ASSERT_EQ(h_offset[k], expected_offset[k]);
    }
    for (size_t k = 0; k < expected_data.size(); ++k) {
      ASSERT_EQ(h_data[k].index, expected_data[k].index);
    }
  }

  // Second push: the number of offsets is unchanged, the entry count doubles
  // and every offset is twice its original value.
  batch.PushCSC(other);
  ASSERT_EQ(batch.offset.HostVector().size(), expected_offset.size());
  ASSERT_EQ(batch.data.Size(), expected_data.size() * 2);
  for (size_t k = 0; k < expected_offset.size(); ++k) {
    ASSERT_EQ(batch.offset.HostVector()[k], expected_offset[k] * 2);
  }

  auto view = batch.GetView();

  // Segment 0 holds the single index-0 entry from each of the two pushes.
  auto first = view[0];
  ASSERT_EQ(first.size(), 2ul);
  for (auto entry : first) {
    ASSERT_EQ(entry.index, 0u);
  }

  // Segment 1 holds indices 1, 2, 3 followed by 1, 2, 3 again.
  auto second = view[1];
  ASSERT_EQ(second.size(), 6ul);
  std::vector<size_t> const cycle{1, 2, 3};
  for (size_t k = 0; k < second.size(); ++k) {
    ASSERT_EQ(second[k].index, cycle[k % 3]);
  }
}
60 
// Transposes every batch of a sparse-page DMatrix, merges the transposed
// batches into one page with PushCSC, then checks the total entry count and
// the ordering of feature values after SortRows.
TEST(SparsePage, PushCSCAfterTranspose) {
  size_t constexpr kPageSize = 1024;
  size_t constexpr kEntriesPerCol = 3;
  size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
  std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries);
  const int ncols = dmat->Info().num_col_;

  // Consolidated page that accumulates the transpose of each batch.
  SparsePage merged;
  for (const auto &batch : dmat->GetBatches<xgboost::SparsePage>()) {
    merged.PushCSC(batch.GetTranspose(ncols));
  }

  // No entry may be lost or duplicated while merging.
  ASSERT_EQ(kEntries, merged.data.Size());

  // After sorting, feature values inside every span of the view must be
  // non-decreasing.
  merged.SortRows();
  auto view = merged.GetView();
  for (size_t col = 0; col < view.Size(); ++col) {
    auto column = view[col];
    for (size_t prev = 0; prev + 1 < column.size(); ++prev) {
      ASSERT_GE(column[prev + 1].fvalue, column[prev].fvalue);
    }
  }
}
85 
// Writes a small dense CSV file into a temporary directory and loads it via
// DMatrix::Load using a URI with an explicit "?format=csv" suffix, then checks
// the reported number of rows and columns.
TEST(DMatrix, Uri) {
  size_t constexpr kRows {16};
  size_t constexpr kCols {8};

  // Cell values are 0, 1, 2, ... in row-major order.
  std::vector<float> values (kRows * kCols);
  for (size_t k = 0; k < values.size(); ++k) {
    values[k] = static_cast<float>(k);
  }

  dmlc::TemporaryDirectory tmpdir;
  std::string path = tmpdir.path + "/small.csv";

  {
    // One line per row; cells are comma separated with no trailing comma.
    std::ofstream fout(path);
    for (size_t r = 0; r < kRows; ++r) {
      for (size_t c = 0; c < kCols; ++c) {
        if (c != 0) {
          fout << ",";
        }
        fout << values[r * kCols + c];
      }
      fout << "\n";
    }
    fout.flush();
    fout.close();
  }

  std::unique_ptr<DMatrix> dmat;
  // FIXME(trivialfis): Enable the following test by restricting csv parser in dmlc-core.
  // EXPECT_THROW(dmat.reset(DMatrix::Load(path, false, true)), dmlc::Error);

  std::string uri = path + "?format=csv";
  dmat.reset(DMatrix::Load(uri, false, true));

  ASSERT_EQ(dmat->Info().num_col_, kCols);
  ASSERT_EQ(dmat->Info().num_row_, kRows);
}
123 }  // namespace xgboost
124