#!/usr/bin/env python
2"""
3test_dataset_info.py
4
5Test that to_matrix() and to_matrix_with_info() return the correct types.
6
7mlpack is free software; you may redistribute it and/or modify it under the
8terms of the 3-clause BSD license.  You should have received a copy of the
93-clause BSD license along with mlpack.  If not, see
10http://www.opensource.org/licenses/BSD-3-Clause for more information.
11"""
12import unittest
13import pandas as pd
14import numpy as np
15
16from mlpack.matrix_utils import to_matrix
17from mlpack.matrix_utils import to_matrix_with_info
18
class TestToMatrix(unittest.TestCase):
  """
  This class defines tests for the to_matrix() utility function.

  Every test unpacks the result of to_matrix() as a 2-tuple; only the first
  element (the converted matrix) is inspected here.
  """

  def testPandasToMatrix(self):
    """
    Test that a simple pandas numeric matrix can be turned into a numpy ndarray.
    """
    colnames = list('abcd')
    d = pd.DataFrame(np.random.randn(100, 4), columns=colnames)

    m, _ = to_matrix(d)

    self.assertIsInstance(m, np.ndarray)
    self.assertEqual(m.shape, (100, 4))
    self.assertEqual(m.dtype, np.dtype(np.double))
    # Column i of the result must hold exactly the values of column i of d.
    for i, name in enumerate(colnames):
      np.testing.assert_array_equal(m[:, i], d[name].values)

  def testPandasIntToMatrix(self):
    """
    Test that a matrix holding ints is properly turned into a double matrix.
    """
    d = pd.DataFrame({'a': range(5)})

    m, _ = to_matrix(d)

    self.assertIsInstance(m, np.ndarray)
    # The docstring promises a double matrix, so check the dtype explicitly.
    self.assertEqual(m.dtype, np.dtype(np.double))
    self.assertEqual(m.shape, (5, 1))
    for i in range(5):
      # Compare scalars rather than a length-1 row against an int.
      self.assertEqual(m[i, 0], i)

  def testPandasMixedToMatrix(self):
    """
    Test that a matrix with one int and one double feature are transformed
    correctly.
    """
    d = pd.DataFrame({'a': range(50)})
    # A 1-D vector is the natural value for a single column.
    d['b'] = np.random.randn(50)
    # The default integer width depends on the platform.
    self.assertIn(d['a'].dtype, (np.dtype('int32'), np.dtype('int64')))
    self.assertEqual(d['b'].dtype, np.dtype(np.double))

    m, _ = to_matrix(d)

    self.assertIsInstance(m, np.ndarray)
    self.assertEqual(m.dtype, np.dtype(np.double))
    self.assertEqual(m.shape, (50, 2))
    for i, name in enumerate('ab'):
      np.testing.assert_array_equal(m[:, i], d[name].values)

  def testArraylikeToMatrix(self):
    """
    Test that if we pass some array, we get back the right thing.  This array
    will be filled with doubles only.
    """
    a = [[0.01, 0.02, 0.03],
         [0.04, 0.05, 0.06],
         [0.07, 0.08, 0.09],
         [0.10, 0.11, 0.12]]

    m, _ = to_matrix(a)

    self.assertIsInstance(m, np.ndarray)
    self.assertEqual(m.dtype, np.dtype(np.double))
    self.assertEqual(m.shape, (4, 3))
    np.testing.assert_array_equal(m, a)

  def testMultitypeArraylikeToMatrix(self):
    """
    Test that if we pass an array with multiple types, we get back the right
    thing.  The numpy ndarray should be filled with doubles only.
    """
    a = [[0.01, 0.02, 3],
         [0.04, 0.05, 6],
         [0.07, 0.08, 9],
         [0.10, 0.11, 12]]

    m, _ = to_matrix(a)

    self.assertIsInstance(m, np.ndarray)
    self.assertEqual(m.dtype, np.dtype(np.double))
    self.assertEqual(m.shape, (4, 3))
    np.testing.assert_array_equal(m, a)

  def testNumpyToMatrix(self):
    """
    Make sure we can convert a numpy matrix without copying anything.
    """
    m1 = np.random.randn(100, 5)
    m2, _ = to_matrix(m1)

    self.assertIsInstance(m2, np.ndarray)
    self.assertEqual(m2.dtype, np.dtype(np.double))

    # The 'data' entry of the array interface is (buffer address, read-only
    # flag); equal entries mean both arrays start at the same memory.
    self.assertEqual(m1.__array_interface__['data'],
                     m2.__array_interface__['data'])

  def testPandasToMatrixNoCategorical(self):
    """
    Make sure that if we pass a Pandas dataframe with no categorical features,
    we get back the matrix we expect.
    """
    # Fixed values (two float columns and one int column) so the expected
    # matrix can be written out in full.
    d = pd.DataFrame({'x': [1.5, -2.0, 3.25],
                      'y': [0.0, 4.5, -1.0],
                      'z': [10, 20, 30]})

    m, _ = to_matrix(d)

    self.assertIsInstance(m, np.ndarray)
    self.assertEqual(m.dtype, np.dtype(np.double))
    self.assertEqual(m.shape, (3, 3))
    np.testing.assert_array_equal(m, [[1.5, 0.0, 10.0],
                                      [-2.0, 4.5, 20.0],
                                      [3.25, -1.0, 30.0]])
140
class TestToMatrixWithInfo(unittest.TestCase):
  """
  This class defines tests for the to_matrix_with_info() utility function.

  Every test unpacks the result as a 3-tuple (matrix, ignored value, dims).
  The tests expect dims to hold one entry per column: 0 for the numeric
  columns used here and 1 for the categorical column in testCategoricalOnly.
  """

  def _assertAllZeroDims(self, dims, n):
    """
    Assert that dims has exactly n entries and each one compares equal to 0.
    """
    self.assertEqual(dims.shape[0], n)
    for k in range(n):
      self.assertEqual(dims[k], 0)

  def testPandasToMatrix(self):
    """
    Test that a simple pandas numeric matrix can be turned into a numpy ndarray.
    """
    colnames = list('abcd')
    d = pd.DataFrame(np.random.randn(100, 4), columns=colnames)

    m, _, dims = to_matrix_with_info(d, np.double)

    self.assertIsInstance(m, np.ndarray)
    self.assertEqual(m.shape, (100, 4))
    self.assertEqual(m.dtype, np.dtype(np.double))
    # Column i of the result must hold exactly the values of column i of d.
    for i, name in enumerate(colnames):
      np.testing.assert_array_equal(m[:, i], d[name].values)

    # This used assertTrue(dims.shape[0], 4), which treats 4 as the failure
    # message and never checks the length.
    self._assertAllZeroDims(dims, 4)

  def testPandasIntToMatrix(self):
    """
    Test that a matrix holding ints is properly turned into a double matrix.
    """
    d = pd.DataFrame({'a': range(5)})

    m, _, dims = to_matrix_with_info(d, np.double)

    self.assertIsInstance(m, np.ndarray)
    # The docstring promises a double matrix, so check the dtype explicitly.
    self.assertEqual(m.dtype, np.dtype(np.double))
    self.assertEqual(m.shape, (5, 1))
    for i in range(5):
      # Compare scalars rather than a length-1 row against an int.
      self.assertEqual(m[i, 0], i)

    # This used assertTrue(dims.shape[0], 1), which never checked the length.
    self._assertAllZeroDims(dims, 1)

  def testPandasMixedToMatrix(self):
    """
    Test that a matrix with one int and one double feature are transformed
    correctly.
    """
    d = pd.DataFrame({'a': range(50)})
    # A 1-D vector is the natural value for a single column.
    d['b'] = np.random.randn(50)
    # The default integer width depends on the platform.
    self.assertIn(d['a'].dtype, (np.dtype('int32'), np.dtype('int64')))
    self.assertEqual(d['b'].dtype, np.dtype(np.double))

    m, _, dims = to_matrix_with_info(d, np.double)

    self.assertIsInstance(m, np.ndarray)
    self.assertEqual(m.dtype, np.dtype(np.double))
    self.assertEqual(m.shape, (50, 2))
    for i, name in enumerate('ab'):
      np.testing.assert_array_equal(m[:, i], d[name].values)

    self._assertAllZeroDims(dims, 2)

  def testArraylikeToMatrix(self):
    """
    Test that if we pass some array, we get back the right thing.  This array
    will be filled with doubles only.
    """
    a = [[0.01, 0.02, 0.03],
         [0.04, 0.05, 0.06],
         [0.07, 0.08, 0.09],
         [0.10, 0.11, 0.12]]

    m, _, dims = to_matrix_with_info(a, np.double)

    self.assertIsInstance(m, np.ndarray)
    self.assertEqual(m.dtype, np.dtype(np.double))
    self.assertEqual(m.shape, (4, 3))
    np.testing.assert_array_equal(m, a)

    self._assertAllZeroDims(dims, 3)

  def testMultitypeArraylikeToMatrix(self):
    """
    Test that if we pass an array with multiple types, we get back the right
    thing.  The numpy ndarray should be filled with doubles only.
    """
    a = [[0.01, 0.02, 3],
         [0.04, 0.05, 6],
         [0.07, 0.08, 9],
         [0.10, 0.11, 12]]

    m, _, dims = to_matrix_with_info(a, np.double)

    self.assertIsInstance(m, np.ndarray)
    self.assertEqual(m.dtype, np.dtype(np.double))
    self.assertEqual(m.shape, (4, 3))
    np.testing.assert_array_equal(m, a)

    self._assertAllZeroDims(dims, 3)

  def testNumpyToMatrix(self):
    """
    Make sure we can convert a numpy matrix without copying anything.
    """
    m1 = np.random.randn(100, 5)
    m2, _, dims = to_matrix_with_info(m1, np.double)

    self.assertIsInstance(m2, np.ndarray)
    self.assertEqual(m2.dtype, np.dtype(np.double))

    # The 'data' entry of the array interface is (buffer address, read-only
    # flag); equal entries mean both arrays start at the same memory.
    self.assertEqual(m1.__array_interface__['data'],
                     m2.__array_interface__['data'])

    self._assertAllZeroDims(dims, 5)

  def testCategoricalOnly(self):
    """
    Make sure that we can convert a categorical-only Pandas matrix.
    """
    d = pd.DataFrame({"A": ["a", "b", "c", "a"]})
    d["A"] = d["A"].astype('category')  # Convert to categorical.

    m, _, dims = to_matrix_with_info(d, np.double)

    self.assertIsInstance(m, np.ndarray)
    self.assertEqual(m.dtype, np.dtype(np.double))

    # The single column is expected to be flagged with a 1.
    self.assertEqual(dims.shape[0], 1)
    self.assertEqual(dims[0], 1)

    self.assertEqual(m.shape, (4, 1))
    # Rows 0 and 3 share the category "a"; the three distinct categories
    # must map to three distinct values.
    self.assertEqual(m[0, 0], m[3, 0])
    self.assertNotEqual(m[0, 0], m[1, 0])
    self.assertNotEqual(m[1, 0], m[2, 0])
    self.assertNotEqual(m[0, 0], m[2, 0])
308
def test_suite():
    """
    Build and return a suite holding the tests of every TestCase class in
    this module, in declaration order.
    """
    load_case = unittest.TestLoader().loadTestsFromTestCase
    return unittest.TestSuite(
        [load_case(case) for case in (TestToMatrix, TestToMatrixWithInfo)])
318
# When executed as a script, let unittest find and run the TestCase classes
# defined in this module.
if __name__ == '__main__':
    unittest.main()
321