1import math
2from collections import OrderedDict
3from datetime import datetime
4
5import pytest
6
7from rpy2 import rinterface
8from rpy2 import robjects
9from rpy2.robjects import vectors
10from rpy2.robjects import conversion
11
12
class MockNamespace:
    """Placeholder object whose missing attributes all evaluate to ``None``.

    It is bound to the name of an optional package when that package cannot
    be imported, so that module-level expressions such as decorator
    arguments can still be evaluated.
    """

    def __getattr__(self, name):
        # __getattr__ is only consulted after regular attribute lookup has
        # failed, so every otherwise-unknown attribute resolves to None.
        return None
16
17
# pandas and numpy are optional. When one of them is missing, its name is
# bound to a MockNamespace so that attribute accesses evaluated at import
# time (for example in parametrize decorators) yield None instead of raising.
# Only ImportError is caught: a bare ``except`` would also swallow
# KeyboardInterrupt / SystemExit and hide unrelated failures.
try:
    import pandas
except ImportError:
    has_pandas = False
    pandas = MockNamespace()
else:
    has_pandas = True

try:
    import numpy
except ImportError:
    has_numpy = False
    numpy = MockNamespace()
else:
    has_numpy = True
31
32if has_pandas:
33    import rpy2.robjects.pandas2ri as rpyp
34
35from rpy2.robjects import default_converter
36from rpy2.robjects.conversion import localconverter
37
@pytest.mark.skipif(not has_pandas, reason='Package pandas is not installed.')
class TestPandasConversions(object):
    """Tests for converting between pandas objects and R objects.

    Most tests run inside a ``localconverter`` combining the default
    converter with the pandas2ri converter (``rpyp.converter``).
    """

    def testActivate(self):
        """activate() grows the py2rpy registry; deactivate() restores it."""
        #FIXME: is the following still making sense ?
        assert rpyp.py2rpy != robjects.conversion.py2rpy
        registry_size = len(robjects.conversion.py2rpy.registry)
        registry_keys = set(robjects.conversion.py2rpy.registry.keys())
        rpyp.activate()
        try:
            assert len(conversion.py2rpy.registry) > registry_size
        finally:
            # Always undo the global activation, even when the assertion
            # fails, so that other tests are not affected.
            rpyp.deactivate()
        assert len(conversion.py2rpy.registry) == registry_size
        assert set(conversion.py2rpy.registry.keys()) == registry_keys

    def testActivateTwice(self):
        """An activate/deactivate cycle can be repeated."""
        #FIXME: is the following still making sense ?
        assert rpyp.py2rpy != robjects.conversion.py2rpy
        registry_size = len(robjects.conversion.py2rpy.registry)
        registry_keys = set(robjects.conversion.py2rpy.registry.keys())
        rpyp.activate()
        rpyp.deactivate()
        rpyp.activate()
        try:
            assert len(conversion.py2rpy.registry) > registry_size
        finally:
            # Always undo the global activation, even when the assertion
            # fails, so that other tests are not affected.
            rpyp.deactivate()
        assert len(conversion.py2rpy.registry) == registry_size
        assert set(conversion.py2rpy.registry.keys()) == registry_keys

    def test_dataframe(self):
        """A pandas DataFrame converts to an R data frame of equal shape."""
        # Content for test data frame
        columns = (
            ('b', numpy.array([True, False, True], dtype=numpy.bool_)),
            ('i', numpy.array([1, 2, 3], dtype='i')),
            ('f', numpy.array([1, 2, 3], dtype='f')),
            # ('s', numpy.array([b'b', b'c', b'd'], dtype='S1')),
            ('u', numpy.array([u'a', u'b', u'c'], dtype='U')),
            ('dates', [datetime(2012, 5, 2),
                       datetime(2012, 6, 3),
                       datetime(2012, 7, 1)])
        )
        od = OrderedDict(columns)
        # Pandas data frame
        pd_df = pandas.core.frame.DataFrame(od)
        # Convert to R
        with localconverter(default_converter + rpyp.converter):
            rp_df = robjects.conversion.py2rpy(pd_df)
        assert pd_df.shape[0] == rp_df.nrow
        assert pd_df.shape[1] == rp_df.ncol
        # assert tuple(rp_df.rx2('s')) == (b'b', b'c', b'd')
        assert tuple(rp_df.rx2('u')) == ('a', 'b', 'c')

    def test_dataframe_columnnames(self):
        """Column names containing spaces are kept unchanged."""
        pd_df = pandas.DataFrame({'the one': [1, 2], 'the other': [3, 4]})
        # Convert to R
        with localconverter(default_converter + rpyp.converter):
            rp_df = robjects.conversion.py2rpy(pd_df)
        assert tuple(rp_df.names) == ('the one', 'the other')

    def test_series(self):
        """A Series of floats converts to an R float vector."""
        Series = pandas.core.series.Series
        s = Series(numpy.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
        with localconverter(default_converter + rpyp.converter):
            rp_s = robjects.conversion.py2rpy(s)
        assert isinstance(rp_s, rinterface.FloatSexpVector)

    # The extension dtypes are passed as instances (not classes), in line
    # with the other parametrized tests of this class.
    @pytest.mark.parametrize('dtype',
                             ('i',
                              numpy.int8 if has_pandas else None,
                              numpy.int16 if has_pandas else None,
                              numpy.int32 if has_pandas else None,
                              numpy.int64 if has_pandas else None,
                              numpy.uint8 if has_pandas else None,
                              numpy.uint16 if has_pandas else None,
                              pandas.Int32Dtype() if has_pandas else None,
                              pandas.Int64Dtype() if has_pandas else None))
    def test_series_int(self, dtype):
        """A Series with an integer dtype converts to an R integer vector."""
        Series = pandas.core.series.Series
        s = Series(range(5),
                   index=['a', 'b', 'c', 'd', 'e'],
                   dtype=dtype)
        with localconverter(default_converter + rpyp.converter):
            rp_s = robjects.conversion.py2rpy(s)
        assert isinstance(rp_s, rinterface.IntSexpVector)

    @pytest.mark.parametrize('dtype',
                             (pandas.Int32Dtype() if has_pandas else None,
                              pandas.Int64Dtype() if has_pandas else None))
    def test_dataframe_int_nan(self, dtype):
        """A missing value in a nullable-integer column becomes NA_Integer."""
        # numpy.nan rather than numpy.NaN: the latter alias is gone in
        # NumPy 2.0.
        a = pandas.DataFrame([(numpy.nan,)], dtype=dtype, columns=['z'])
        with localconverter(default_converter + rpyp.converter):
            b = robjects.conversion.py2rpy(a)
        assert b[0][0] is rinterface.na_values.NA_Integer
        with localconverter(default_converter + rpyp.converter):
            # Convert back to pandas; only checks that this does not raise.
            robjects.conversion.rpy2py(b)

    @pytest.mark.parametrize('dtype',
                             (pandas.Int32Dtype() if has_pandas else None,
                              pandas.Int64Dtype() if has_pandas else None))
    def test_series_int_nan(self, dtype):
        """A missing value in a nullable-integer Series becomes NA_Integer."""
        # numpy.nan rather than numpy.NaN: the latter alias is gone in
        # NumPy 2.0.
        a = pandas.Series((numpy.nan,), dtype=dtype, index=['z'])
        with localconverter(default_converter + rpyp.converter):
            b = robjects.conversion.py2rpy(a)
        assert b[0] is rinterface.na_values.NA_Integer
        with localconverter(default_converter + rpyp.converter):
            # Convert back to pandas; only checks that this does not raise.
            robjects.conversion.rpy2py(b)

    @pytest.mark.skipif(not (has_numpy and has_pandas),
                        reason='Packages numpy and pandas must be installed.')
    @pytest.mark.parametrize(
        'data',
        (['x', 'y', 'z'],
         ['x', 'y', None],
         ['x', 'y', numpy.nan],
         ['x', 'y', pandas.NA])
    )
    @pytest.mark.parametrize(
        'dtype', ['O', pandas.StringDtype() if has_pandas else None]
    )
    def test_series_obj_str(self, data, dtype):
        """String Series, with or without missing values, give R strings."""
        Series = pandas.core.series.Series
        s = Series(data, index=['a', 'b', 'c'], dtype=dtype)
        with localconverter(default_converter + rpyp.converter):
            rp_s = robjects.conversion.py2rpy(s)
        assert isinstance(rp_s, rinterface.StrSexpVector)

    def test_series_obj_mixed(self):
        """Object Series mixing value types raise ValueError on conversion."""
        Series = pandas.core.series.Series
        s = Series(['x', 1, False], index=['a', 'b', 'c'])
        with localconverter(default_converter + rpyp.converter):
            with pytest.raises(ValueError):
                robjects.conversion.py2rpy(s)

        s = Series(['x', 1, None], index=['a', 'b', 'c'])
        with localconverter(default_converter + rpyp.converter):
            with pytest.raises(ValueError):
                robjects.conversion.py2rpy(s)

    def test_series_obj_bool(self):
        """Boolean Series, with or without None, give R logical vectors."""
        Series = pandas.core.series.Series
        s = Series([True, False, True], index=['a', 'b', 'c'])
        with localconverter(default_converter + rpyp.converter):
            rp_s = robjects.conversion.py2rpy(s)
        assert isinstance(rp_s, rinterface.BoolSexpVector)

        s = Series([True, False, None], index=['a', 'b', 'c'])
        with localconverter(default_converter + rpyp.converter):
            rp_s = robjects.conversion.py2rpy(s)
        assert isinstance(rp_s, rinterface.BoolSexpVector)

    def test_series_obj_allnone(self):
        """A Series holding only None converts to an R logical vector."""
        Series = pandas.core.series.Series
        s = Series([None, None, None], index=['a', 'b', 'c'])
        with localconverter(default_converter + rpyp.converter):
            rp_s = robjects.conversion.py2rpy(s)
        assert isinstance(rp_s, rinterface.BoolSexpVector)

    def test_series_issue264(self):
        """A string Series with an integer index converts and prints."""
        Series = pandas.core.series.Series
        # pandas.Index(..., dtype='int64') replaces pandas.Int64Index, which
        # is no longer available in pandas 2.0.
        s = Series(('a', 'b', 'c', 'd', 'e'),
                   index=pandas.Index([0, 1, 2, 3, 4], dtype='int64'))
        with localconverter(default_converter + rpyp.converter):
            rp_s = robjects.conversion.py2rpy(s)
        # segfault before the fix
        str(rp_s)
        assert isinstance(rp_s, rinterface.StrSexpVector)

    def test_object2String(self):
        """An object-dtype Series of strings gives an R string vector."""
        series = pandas.Series(["a", "b", "c", "a"], dtype="O")
        with localconverter(default_converter + rpyp.converter):
            rp_c = robjects.conversion.py2rpy(series)
            assert isinstance(rp_c, rinterface.StrSexpVector)

    def test_object2String_with_None(self):
        """A leading None does not prevent conversion to R strings."""
        series = pandas.Series([None, "a", "b", "c", "a"], dtype="O")
        with localconverter(default_converter + rpyp.converter):
            rp_c = robjects.conversion.py2rpy(series)
            assert isinstance(rp_c, rinterface.StrSexpVector)

    def test_factor2Category(self):
        """An R factor converts to a pandas Categorical."""
        factor = robjects.vectors.FactorVector(('a', 'b', 'a'))
        with localconverter(default_converter + rpyp.converter):
            rp_c = robjects.conversion.rpy2py(factor)
        assert isinstance(rp_c, pandas.Categorical)

    def test_factorwithNA2Category(self):
        """An NA in an R factor becomes NaN in the pandas Categorical."""
        factor = robjects.vectors.FactorVector(('a', 'b', 'a', None))
        assert factor[3] is rinterface.na_values.NA_Integer
        with localconverter(default_converter + rpyp.converter):
            rp_c = robjects.conversion.rpy2py(factor)
        assert isinstance(rp_c, pandas.Categorical)
        assert math.isnan(rp_c[3])

    def test_orderedFactor2Category(self):
        """An ordered R factor converts to a pandas Categorical."""
        factor = robjects.vectors.FactorVector(('a', 'b', 'a'), ordered=True)
        with localconverter(default_converter + rpyp.converter):
            rp_c = robjects.conversion.rpy2py(factor)
        assert isinstance(rp_c, pandas.Categorical)

    def test_category2Factor(self):
        """A category-dtype Series converts to an R factor."""
        category = pandas.Series(["a", "b", "c", "a"], dtype="category")
        with localconverter(default_converter + rpyp.converter):
            rp_c = robjects.conversion.py2rpy(category)
            assert isinstance(rp_c, robjects.vectors.FactorVector)

    def test_categorywithNA2Factor(self):
        """A NaN in a category-dtype Series becomes NA in the R factor."""
        category = pandas.Series(['a', 'b', 'c', numpy.nan], dtype='category')
        with localconverter(default_converter + rpyp.converter):
            rp_c = robjects.conversion.py2rpy(category)
            assert isinstance(rp_c, robjects.vectors.FactorVector)
        assert rp_c[3] == rinterface.NA_Integer

    def test_orderedCategory2Factor(self):
        """An ordered Categorical Series converts to an R factor."""
        category = pandas.Series(pandas.Categorical(['a', 'b', 'c', 'a'],
                                                    categories=['a', 'b', 'c'],
                                                    ordered=True))
        with localconverter(default_converter + rpyp.converter):
            rp_c = robjects.conversion.py2rpy(category)
            assert isinstance(rp_c, robjects.vectors.FactorVector)

    def test_datetime2posixct(self):
        """Timezone-aware timestamps give POSIXct, keeping sub-seconds."""
        # Named `timestamps` so the module-level `datetime` import is not
        # shadowed inside this test.
        timestamps = pandas.Series(
            pandas.date_range('2017-01-01 00:00:00.234',
                              periods=20, freq='ms', tz='UTC')
        )
        with localconverter(default_converter + rpyp.converter):
            rp_c = robjects.conversion.py2rpy(timestamps)
            assert isinstance(rp_c, robjects.vectors.POSIXct)
            assert int(rp_c[0]) == 1483228800
            assert int(rp_c[1]) == 1483228800
            # Same whole second, but the millisecond offsets differ.
            assert rp_c[0] != rp_c[1]

    def test_datetime2posixct_withNA(self):
        """NaT in a timestamp Series becomes NaN in the POSIXct vector."""
        # Named `timestamps` so the module-level `datetime` import is not
        # shadowed inside this test.
        timestamps = pandas.Series(
            pandas.date_range('2017-01-01 00:00:00.234',
                              periods=20, freq='ms', tz='UTC')
        )
        timestamps[1] = pandas.NaT
        with localconverter(default_converter + rpyp.converter):
            rp_c = robjects.conversion.py2rpy(timestamps)
            assert isinstance(rp_c, robjects.vectors.POSIXct)
            assert int(rp_c[0]) == 1483228800
            assert math.isnan(rp_c[1])
            assert rp_c[0] != rp_c[1]

    def test_date2posixct(self):
        """A Series of date objects gives an R float vector of ordinals."""
        today = datetime.now().date()
        date = pandas.Series([today])
        with localconverter(default_converter + rpyp.converter):
            rp_c = robjects.conversion.py2rpy(date)
            assert isinstance(rp_c, robjects.vectors.FloatSexpVector)
            assert tuple(int(x) for x in rp_c) == (today.toordinal(), )

    def test_timeR2Pandas(self):
        """An R POSIXct vector converts to pandas timestamps; NA gives NaT."""
        tzone = robjects.vectors.get_timezone()
        dt = [datetime(1960, 5, 2),
              datetime(1970, 6, 3),
              datetime(2012, 7, 1)]
        dt = [x.replace(tzinfo=tzone) for x in dt]
        # fix the time
        ts = [x.timestamp() for x in dt]
        # Create an R POSIXct vector.
        r_time = robjects.baseenv['as.POSIXct'](
            rinterface.FloatSexpVector(ts),
            origin=rinterface.StrSexpVector(('1970-01-01',))
        )

        # Convert R POSIXct vector to pandas-compatible vector
        with localconverter(default_converter + rpyp.converter):
            py_time = robjects.conversion.rpy2py(r_time)

        # Check that the round trip did not introduce changes
        for expected, obtained in zip(dt, py_time):
            assert expected == obtained.to_pydatetime()

        # Try with NA.
        r_time[1] = rinterface.na_values.NA_Real
        # Convert R POSIXct vector to pandas-compatible vector
        with localconverter(default_converter + rpyp.converter):
            py_time = robjects.conversion.rpy2py(r_time)

        assert py_time[1] is pandas.NaT

    def test_posixct_in_dataframe_to_pandas(self):
        """A POSIXct column of an R data frame gets a datetime64 dtype."""
        tzone = robjects.vectors.get_timezone()
        dt = [datetime(1960, 5, 2),
              datetime(1970, 6, 3),
              datetime(2012, 7, 1)]
        dt = [x.replace(tzinfo=tzone) for x in dt]
        # fix the time
        ts = [x.timestamp() for x in dt]
        # Create an R data.frame with a posixct_vector.
        r_dataf = robjects.vectors.DataFrame({
            'mydate': robjects.baseenv['as.POSIXct'](
                rinterface.FloatSexpVector(ts),
                origin=rinterface.StrSexpVector(('1970-01-01',))
            )})

        # Convert R POSIXct vector to pandas-compatible vector
        with localconverter(default_converter + rpyp.converter):
            py_dataf = robjects.conversion.rpy2py(r_dataf)
        assert pandas.core.dtypes.common.is_datetime64_any_dtype(
            py_dataf['mydate']
        )

    def test_repr(self):
        """repr() of a converted data frame lists the column vector types."""
        # this should go to testVector, with other tests for repr()
        columns = (
            ('b', numpy.array([True, False, True], dtype=numpy.bool_)),
            ('i', numpy.array([1, 2, 3], dtype="i")),
            ('f', numpy.array([1, 2, 3], dtype="f")),
            ('s', numpy.array(["a", "b", "c"], dtype="S")),
            ('u', numpy.array([u"a", u"b", u"c"], dtype="U")),
        )
        od = OrderedDict(columns)
        pd_df = pandas.core.frame.DataFrame(od)
        with localconverter(default_converter + rpyp.converter):
            rp_df = robjects.conversion.py2rpy(pd_df)
        s = repr(rp_df)  # used to fail with a TypeError.
        s = s.split('\n')
        repr_str = ('[BoolSex..., IntSexp..., FloatSe..., '
                    'ByteSex..., StrSexp...]')
        assert repr_str == s[2].strip()

        # Try again with the conversion still active.
        with localconverter(default_converter + rpyp.converter):
            rp_df = robjects.conversion.py2rpy(pd_df)
            s = repr(rp_df)  # used to fail with a TypeError.
        s = s.split('\n')
        assert repr_str == s[2].strip()

    def test_ri2pandas_dtypes(self):
        """Columns of an R data frame map to the expected pandas dtypes.

        This test used to be named ``test_ri2pandas`` like the row-names
        test below, which shadowed it so that it was never collected.
        """
        # Column `c` is built with an explicit factor(): the assertion below
        # expects a categorical column, and data.frame() only turns strings
        # into factors implicitly on R versions before 4.0.
        rdataf = robjects.r('data.frame(a=1:2, '
                            '           b=I(c("a", "b")), '
                            '           c=factor(c("a", "b")))')
        with localconverter(default_converter + rpyp.converter):
            pandas_df = robjects.conversion.rpy2py(rdataf)

        assert isinstance(pandas_df, pandas.DataFrame)
        assert ('a', 'b', 'c') == tuple(pandas_df.keys())
        assert pandas_df['a'].dtype in (numpy.dtype('int32'),
                                        numpy.dtype('int64'))
        assert pandas_df['b'].dtype == numpy.dtype('O')
        assert isinstance(pandas_df['c'].dtype,
                          pandas.api.types.CategoricalDtype)

    def test_ri2pandas(self):
        """Row names of an R data frame become the pandas index."""
        rdataf = robjects.r('data.frame(a=1:2, '
                            '           row.names=c("a", "b"))')
        with localconverter(default_converter + rpyp.converter) as cv:
            pandas_df = cv.rpy2py(rdataf)
        assert all(x == y for x, y in zip(rdataf.rownames, pandas_df.index))

    def test_ri2pandas_issue207(self):
        """Assigning an R data frame into globalenv raises no ValueError."""
        d = robjects.DataFrame({'x': 1})
        with localconverter(default_converter + rpyp.converter):
            try:
                ok = True
                robjects.globalenv['d'] = d
            except ValueError:
                ok = False
            finally:
                # Do not leave the test variable behind in the R session.
                if 'd' in robjects.globalenv:
                    del robjects.globalenv['d']
        assert ok
400