1#! /usr/bin/env python
2from __future__ import division, with_statement, absolute_import, print_function
3
4__copyright__ = "Copyright (C) 2009 Andreas Kloeckner"
5
6__license__ = """
7Permission is hereby granted, free of charge, to any person obtaining a copy
8of this software and associated documentation files (the "Software"), to deal
9in the Software without restriction, including without limitation the rights
10to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11copies of the Software, and to permit persons to whom the Software is
12furnished to do so, subject to the following conditions:
13
14The above copyright notice and this permission notice shall be included in
15all copies or substantial portions of the Software.
16
17THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23THE SOFTWARE.
24"""
25
26import numpy as np
27import numpy.linalg as la
28import sys
29
30from six.moves import range
31import pytest
32
33import pyopencl as cl
34import pyopencl.array as cl_array
35import pyopencl.cltypes as cltypes
36import pyopencl.tools as cl_tools
37from pyopencl.tools import (  # noqa
38        pytest_generate_tests_for_pyopencl as pytest_generate_tests)
39from pyopencl.characterize import has_double_support, has_struct_arg_count_bug
40
41from pyopencl.clrandom import RanluxGenerator, PhiloxGenerator, ThreefryGenerator
42
43_PYPY = cl._PYPY
44
45
46# {{{ helpers
47
48TO_REAL = {
49        np.dtype(np.complex64): np.float32,
50        np.dtype(np.complex128): np.float64
51        }
52
53
54def general_clrand(queue, shape, dtype):
55    from pyopencl.clrandom import rand as clrand
56
57    dtype = np.dtype(dtype)
58    if dtype.kind == "c":
59        real_dtype = dtype.type(0).real.dtype
60        return clrand(queue, shape, real_dtype) + 1j*clrand(queue, shape, real_dtype)
61    else:
62        return clrand(queue, shape, dtype)
63
64
65def make_random_array(queue, dtype, size):
66    from pyopencl.clrandom import rand
67
68    dtype = np.dtype(dtype)
69    if dtype.kind == "c":
70        real_dtype = TO_REAL[dtype]
71        return (rand(queue, shape=(size,), dtype=real_dtype).astype(dtype)
72                + rand(queue, shape=(size,), dtype=real_dtype).astype(dtype)
73                * dtype.type(1j))
74    else:
75        return rand(queue, shape=(size,), dtype=dtype)
76
77# }}}
78
79
80# {{{ dtype-related
81
82def test_basic_complex(ctx_factory):
83    context = ctx_factory()
84    queue = cl.CommandQueue(context)
85
86    from pyopencl.clrandom import rand
87
88    size = 500
89
90    ary = (rand(queue, shape=(size,), dtype=np.float32).astype(np.complex64)
91            + rand(queue, shape=(size,), dtype=np.float32).astype(np.complex64) * 1j)
92    c = np.complex64(5+7j)
93
94    host_ary = ary.get()
95    assert la.norm((ary*c).get() - c*host_ary) < 1e-5 * la.norm(host_ary)
96
97
98def test_mix_complex(ctx_factory):
99    context = ctx_factory()
100    queue = cl.CommandQueue(context)
101
102    size = 10
103
104    dtypes = [
105            (np.float32, np.complex64),
106            #(np.int32, np.complex64),
107            ]
108
109    dev = context.devices[0]
110    if has_double_support(dev) and has_struct_arg_count_bug(dev) == "apple":
111        dtypes.extend([
112            (np.float32, np.float64),
113            ])
114    elif has_double_support(dev):
115        dtypes.extend([
116            (np.float32, np.float64),
117            (np.float32, np.complex128),
118            (np.float64, np.complex64),
119            (np.float64, np.complex128),
120            ])
121
122    from operator import add, mul, sub, truediv
123    for op in [add, sub, mul, truediv, pow]:
124        for dtype_a0, dtype_b0 in dtypes:
125            for dtype_a, dtype_b in [
126                    (dtype_a0, dtype_b0),
127                    (dtype_b0, dtype_a0),
128                    ]:
129                for is_scalar_a, is_scalar_b in [
130                        (False, False),
131                        (False, True),
132                        (True, False),
133                        ]:
134                    if is_scalar_a:
135                        ary_a = make_random_array(queue, dtype_a, 1).get()[0]
136                        host_ary_a = ary_a
137                    else:
138                        ary_a = make_random_array(queue, dtype_a, size)
139                        host_ary_a = ary_a.get()
140
141                    if is_scalar_b:
142                        ary_b = make_random_array(queue, dtype_b, 1).get()[0]
143                        host_ary_b = ary_b
144                    else:
145                        ary_b = make_random_array(queue, dtype_b, size)
146                        host_ary_b = ary_b.get()
147
148                    print(op, dtype_a, dtype_b, is_scalar_a, is_scalar_b)
149                    dev_result = op(ary_a, ary_b).get()
150                    host_result = op(host_ary_a, host_ary_b)
151
152                    if host_result.dtype != dev_result.dtype:
153                        # This appears to be a numpy bug, where we get
154                        # served a Python complex that is really a
155                        # smaller numpy complex.
156
157                        print("HOST_DTYPE: %s DEV_DTYPE: %s" % (
158                                host_result.dtype, dev_result.dtype))
159
160                        dev_result = dev_result.astype(host_result.dtype)
161
162                    err = la.norm(host_result-dev_result)/la.norm(host_result)
163                    print(err)
164                    correct = err < 1e-4
165                    if not correct:
166                        print(host_result)
167                        print(dev_result)
168                        print(host_result - dev_result)
169
170                    assert correct
171
172
173def test_pow_neg1_vs_inv(ctx_factory):
174    ctx = ctx_factory()
175    queue = cl.CommandQueue(ctx)
176
177    device = ctx.devices[0]
178    if not has_double_support(device):
179        from pytest import skip
180        skip("double precision not supported on %s" % device)
181    if has_struct_arg_count_bug(device) == "apple":
182        from pytest import xfail
183        xfail("apple struct arg counting broken")
184
185    a_dev = make_random_array(queue, np.complex128, 20000)
186
187    res1 = (a_dev ** (-1)).get()
188    res2 = (1/a_dev).get()
189    ref = 1/a_dev.get()
190
191    assert la.norm(res1-ref, np.inf) / la.norm(ref) < 1e-13
192    assert la.norm(res2-ref, np.inf) / la.norm(ref) < 1e-13
193
194
195def test_vector_fill(ctx_factory):
196    context = ctx_factory()
197    queue = cl.CommandQueue(context)
198
199    a_gpu = cl_array.Array(queue, 100, dtype=cltypes.float4)
200    a_gpu.fill(cltypes.make_float4(0.0, 0.0, 1.0, 0.0))
201    a = a_gpu.get()
202    assert a.dtype == cltypes.float4
203
204    a_gpu = cl_array.zeros(queue, 100, dtype=cltypes.float4)
205
206
207def test_absrealimag(ctx_factory):
208    context = ctx_factory()
209    queue = cl.CommandQueue(context)
210
211    def real(x):
212        return x.real
213
214    def imag(x):
215        return x.imag
216
217    def conj(x):
218        return x.conj()
219
220    n = 111
221    for func in [abs, real, imag, conj]:
222        for dtype in [np.int32, np.float32, np.complex64]:
223            print(func, dtype)
224            a = -make_random_array(queue, dtype, n)
225
226            host_res = func(a.get())
227            dev_res = func(a).get()
228
229            correct = np.allclose(dev_res, host_res)
230            if not correct:
231                print(dev_res)
232                print(host_res)
233                print(dev_res-host_res)
234            assert correct
235
236
237def test_custom_type_zeros(ctx_factory):
238    context = ctx_factory()
239    queue = cl.CommandQueue(context)
240
241    if not (
242            queue._get_cl_version() >= (1, 2)
243            and cl.get_cl_header_version() >= (1, 2)):
244        pytest.skip("CL1.2 not available")
245
246    dtype = np.dtype([
247        ("cur_min", np.int32),
248        ("cur_max", np.int32),
249        ("pad", np.int32),
250        ])
251
252    from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct
253
254    name = "mmc_type"
255    dtype, c_decl = match_dtype_to_c_struct(queue.device, name, dtype)
256    dtype = get_or_register_dtype(name, dtype)
257
258    n = 1000
259    z_dev = cl.array.zeros(queue, n, dtype=dtype)
260
261    z = z_dev.get()
262
263    assert np.array_equal(np.zeros(n, dtype), z)
264
265
266def test_custom_type_fill(ctx_factory):
267    context = ctx_factory()
268    queue = cl.CommandQueue(context)
269
270    from pyopencl.characterize import has_struct_arg_count_bug
271    if has_struct_arg_count_bug(queue.device):
272        pytest.skip("device has LLVM arg counting bug")
273
274    dtype = np.dtype([
275        ("cur_min", np.int32),
276        ("cur_max", np.int32),
277        ("pad", np.int32),
278        ])
279
280    from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct
281
282    name = "mmc_type"
283    dtype, c_decl = match_dtype_to_c_struct(queue.device, name, dtype)
284    dtype = get_or_register_dtype(name, dtype)
285
286    n = 1000
287    z_dev = cl.array.empty(queue, n, dtype=dtype)
288    z_dev.fill(np.zeros((), dtype))
289
290    z = z_dev.get()
291
292    assert np.array_equal(np.zeros(n, dtype), z)
293
294
295def test_custom_type_take_put(ctx_factory):
296    context = ctx_factory()
297    queue = cl.CommandQueue(context)
298
299    dtype = np.dtype([
300        ("cur_min", np.int32),
301        ("cur_max", np.int32),
302        ])
303
304    from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct
305
306    name = "tp_type"
307    dtype, c_decl = match_dtype_to_c_struct(queue.device, name, dtype)
308    dtype = get_or_register_dtype(name, dtype)
309
310    n = 100
311    z = np.empty(100, dtype)
312    z["cur_min"] = np.arange(n)
313    z["cur_max"] = np.arange(n)**2
314
315    z_dev = cl.array.to_device(queue, z)
316    ind = cl.array.arange(queue, n, step=3, dtype=np.int32)
317
318    z_ind_ref = z[ind.get()]
319    z_ind = z_dev[ind]
320
321    assert np.array_equal(z_ind.get(), z_ind_ref)
322
323# }}}
324
325
326# {{{ operators
327
328def test_rmul_yields_right_type(ctx_factory):
329    context = ctx_factory()
330    queue = cl.CommandQueue(context)
331
332    a = np.array([1, 2, 3, 4, 5]).astype(np.float32)
333    a_gpu = cl_array.to_device(queue, a)
334
335    two_a = 2*a_gpu
336    assert isinstance(two_a, cl_array.Array)
337
338    two_a = np.float32(2)*a_gpu
339    assert isinstance(two_a, cl_array.Array)
340
341
342def test_pow_array(ctx_factory):
343    context = ctx_factory()
344    queue = cl.CommandQueue(context)
345
346    a = np.array([1, 2, 3, 4, 5]).astype(np.float32)
347    a_gpu = cl_array.to_device(queue, a)
348
349    result = pow(a_gpu, a_gpu).get()
350    assert (np.abs(a ** a - result) < 3e-3).all()
351
352    result = (a_gpu ** a_gpu).get()
353    assert (np.abs(pow(a, a) - result) < 3e-3).all()
354
355
356def test_pow_number(ctx_factory):
357    context = ctx_factory()
358    queue = cl.CommandQueue(context)
359
360    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
361    a_gpu = cl_array.to_device(queue, a)
362
363    result = pow(a_gpu, 2).get()
364    assert (np.abs(a ** 2 - result) < 1e-3).all()
365
366
367def test_multiply(ctx_factory):
368    """Test the muliplication of an array with a scalar. """
369
370    context = ctx_factory()
371    queue = cl.CommandQueue(context)
372
373    for sz in [10, 50000]:
374        for dtype, scalars in [
375                (np.float32, [2]),
376                (np.complex64, [2j]),
377                ]:
378            for scalar in scalars:
379                a_gpu = make_random_array(queue, dtype, sz)
380                a = a_gpu.get()
381                a_mult = (scalar * a_gpu).get()
382
383                assert (a * scalar == a_mult).all()
384
385
386def test_multiply_array(ctx_factory):
387    """Test the multiplication of two arrays."""
388
389    context = ctx_factory()
390    queue = cl.CommandQueue(context)
391
392    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
393
394    a_gpu = cl_array.to_device(queue, a)
395    b_gpu = cl_array.to_device(queue, a)
396
397    a_squared = (b_gpu * a_gpu).get()
398
399    assert (a * a == a_squared).all()
400
401
402def test_addition_array(ctx_factory):
403    """Test the addition of two arrays."""
404
405    context = ctx_factory()
406    queue = cl.CommandQueue(context)
407
408    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
409    a_gpu = cl_array.to_device(queue, a)
410    a_added = (a_gpu + a_gpu).get()
411
412    assert (a + a == a_added).all()
413
414
415def test_addition_scalar(ctx_factory):
416    """Test the addition of an array and a scalar."""
417
418    context = ctx_factory()
419    queue = cl.CommandQueue(context)
420
421    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
422    a_gpu = cl_array.to_device(queue, a)
423    a_added = (7 + a_gpu).get()
424
425    assert (7 + a == a_added).all()
426
427
428def test_substract_array(ctx_factory):
429    """Test the substraction of two arrays."""
430    #test data
431    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
432    b = np.array([10, 20, 30, 40, 50,
433                  60, 70, 80, 90, 100]).astype(np.float32)
434
435    context = ctx_factory()
436    queue = cl.CommandQueue(context)
437
438    a_gpu = cl_array.to_device(queue, a)
439    b_gpu = cl_array.to_device(queue, b)
440
441    result = (a_gpu - b_gpu).get()
442    assert (a - b == result).all()
443
444    result = (b_gpu - a_gpu).get()
445    assert (b - a == result).all()
446
447
448def test_substract_scalar(ctx_factory):
449    """Test the substraction of an array and a scalar."""
450
451    context = ctx_factory()
452    queue = cl.CommandQueue(context)
453
454    #test data
455    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
456
457    #convert a to a gpu object
458    a_gpu = cl_array.to_device(queue, a)
459
460    result = (a_gpu - 7).get()
461    assert (a - 7 == result).all()
462
463    result = (7 - a_gpu).get()
464    assert (7 - a == result).all()
465
466
467def test_divide_scalar(ctx_factory):
468    """Test the division of an array and a scalar."""
469
470    context = ctx_factory()
471    queue = cl.CommandQueue(context)
472
473    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
474    a_gpu = cl_array.to_device(queue, a)
475
476    result = (a_gpu / 2).get()
477    assert (a / 2 == result).all()
478
479    result = (2 / a_gpu).get()
480    assert (np.abs(2 / a - result) < 1e-5).all()
481
482
483def test_divide_array(ctx_factory):
484    """Test the division of an array and a scalar. """
485
486    context = ctx_factory()
487    queue = cl.CommandQueue(context)
488
489    #test data
490    a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(np.float32)
491    b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(np.float32)
492
493    a_gpu = cl_array.to_device(queue, a)
494    b_gpu = cl_array.to_device(queue, b)
495
496    a_divide = (a_gpu / b_gpu).get()
497    assert (np.abs(a / b - a_divide) < 1e-3).all()
498
499    a_divide = (b_gpu / a_gpu).get()
500    assert (np.abs(b / a - a_divide) < 1e-3).all()
501
502
503def test_bitwise(ctx_factory):
504    if _PYPY:
505        pytest.xfail("numpypy: missing bitwise ops")
506
507    context = ctx_factory()
508    queue = cl.CommandQueue(context)
509
510    from itertools import product
511
512    dtypes = [np.dtype(t) for t in (np.int64, np.int32, np.int16, np.int8)]
513
514    from pyopencl.clrandom import rand as clrand
515
516    for a_dtype, b_dtype in product(dtypes, dtypes):
517        ary_len = 16
518
519        np.random.seed(10)
520
521        int32_min = np.iinfo(np.int32).min
522        int32_max = np.iinfo(np.int32).max
523
524        a_dev = clrand(
525            queue, (ary_len,), a=int32_min, b=1+int32_max, dtype=np.int64
526            ).astype(a_dtype)
527        b_dev = clrand(
528            queue, (ary_len,), a=int32_min, b=1+int32_max, dtype=np.int64
529            ).astype(b_dtype)
530
531        a = a_dev.get()
532        b = b_dev.get()
533        s = int((clrand(queue, (), a=int32_min, b=1+int32_max, dtype=np.int64)
534                 .astype(b_dtype).get()))
535
536        import operator as o
537
538        for op in [o.and_, o.or_, o.xor]:
539            res_dev = op(a_dev, b_dev)
540            res = op(a, b)
541
542            assert (res_dev.get() == res).all()
543
544            res_dev = op(a_dev, s)
545            res = op(a, s)
546
547            assert (res_dev.get() == res).all()
548
549            res_dev = op(s, b_dev)
550            res = op(s, b)
551
552            assert (res_dev.get() == res).all()
553
554        for op in [o.iand, o.ior, o.ixor]:
555            res_dev = a_dev.copy()
556            op_res = op(res_dev, b_dev)
557            assert op_res is res_dev
558
559            res = a.copy()
560            op(res, b)
561
562            assert (res_dev.get() == res).all()
563
564            res_dev = a_dev.copy()
565            op_res = op(res_dev, s)
566            assert op_res is res_dev
567            res = a.copy()
568            op(res, s)
569
570            assert (res_dev.get() == res).all()
571
572        # Test unary ~
573        res_dev = ~a_dev
574        res = ~a
575        assert (res_dev.get() == res).all()
576
577# }}}
578
579
580# {{{ RNG
581
582@pytest.mark.parametrize("rng_class",
583        [RanluxGenerator, PhiloxGenerator, ThreefryGenerator])
584@pytest.mark.parametrize("ary_size", [300, 301, 302, 303, 10007, 1000000])
585def test_random_float_in_range(ctx_factory, rng_class, ary_size, plot_hist=False):
586    context = ctx_factory()
587    queue = cl.CommandQueue(context)
588
589    if has_double_support(context.devices[0]):
590        dtypes = [np.float32, np.float64]
591    else:
592        dtypes = [np.float32]
593
594    if rng_class is RanluxGenerator:
595        gen = rng_class(queue, 5120)
596    else:
597        gen = rng_class(context)
598
599    for dtype in dtypes:
600        print(dtype)
601        ran = cl_array.zeros(queue, ary_size, dtype)
602        gen.fill_uniform(ran)
603
604        if plot_hist:
605            import matplotlib.pyplot as pt
606            pt.hist(ran.get(), 30)
607            pt.show()
608
609        assert (0 <= ran.get()).all()
610        assert (ran.get() <= 1).all()
611
612        if rng_class is RanluxGenerator:
613            gen.synchronize(queue)
614
615        ran = cl_array.zeros(queue, ary_size, dtype)
616        gen.fill_uniform(ran, a=4, b=7)
617        ran_host = ran.get()
618
619        for cond in [4 <= ran_host,  ran_host <= 7]:
620            good = cond.all()
621            if not good:
622                print(np.where(~cond))
623                print(ran_host[~cond])
624            assert good
625
626        ran = gen.normal(queue, ary_size, dtype, mu=10, sigma=3)
627
628        if plot_hist:
629            import matplotlib.pyplot as pt
630            pt.hist(ran.get(), 30)
631            pt.show()
632
633
634@pytest.mark.parametrize("dtype", [np.int32, np.int64])
635@pytest.mark.parametrize("rng_class",
636        [RanluxGenerator, PhiloxGenerator, ThreefryGenerator])
637def test_random_int_in_range(ctx_factory, rng_class, dtype, plot_hist=False):
638    context = ctx_factory()
639    queue = cl.CommandQueue(context)
640
641    if rng_class is RanluxGenerator:
642        gen = rng_class(queue, 5120)
643    else:
644        gen = rng_class(context)
645
646    # if (dtype == np.int64
647    #         and context.devices[0].platform.vendor.startswith("Advanced Micro")):
648    #     pytest.xfail("AMD miscompiles 64-bit RNG math")
649
650    ran = gen.uniform(queue, (10000007,), dtype, a=200, b=300).get()
651    assert (200 <= ran).all()
652    assert (ran < 300).all()
653
654    print(np.min(ran), np.max(ran))
655    assert np.max(ran) > 295
656
657    if plot_hist:
658        from matplotlib import pyplot as pt
659        pt.hist(ran)
660        pt.show()
661
662# }}}
663
664
665# {{{ misc
666
667def test_numpy_integer_shape(ctx_factory):
668    try:
669        list(np.int32(17))
670    except Exception:
671        pass
672    else:
673        from pytest import skip
674        skip("numpy implementation does not handle scalar correctly.")
675    context = ctx_factory()
676    queue = cl.CommandQueue(context)
677
678    cl_array.empty(queue, np.int32(17), np.float32)
679    cl_array.empty(queue, (np.int32(17), np.int32(17)), np.float32)
680
681
682def test_len(ctx_factory):
683    context = ctx_factory()
684    queue = cl.CommandQueue(context)
685
686    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
687    a_cpu = cl_array.to_device(queue, a)
688    assert len(a_cpu) == 10
689
690
691def test_stride_preservation(ctx_factory):
692    if _PYPY:
693        pytest.xfail("numpypy: no array creation from __array_interface__")
694
695    context = ctx_factory()
696    queue = cl.CommandQueue(context)
697
698    a = np.random.rand(3, 3)
699    at = a.T
700    print(at.flags.f_contiguous, at.flags.c_contiguous)
701    at_gpu = cl_array.to_device(queue, at)
702    print(at_gpu.flags.f_contiguous, at_gpu.flags.c_contiguous)
703    assert np.allclose(at_gpu.get(), at)
704
705
706def test_nan_arithmetic(ctx_factory):
707    context = ctx_factory()
708    queue = cl.CommandQueue(context)
709
710    def make_nan_contaminated_vector(size):
711        shape = (size,)
712        a = np.random.randn(*shape).astype(np.float32)
713        from random import randrange
714        for i in range(size // 10):
715            a[randrange(0, size)] = float('nan')
716        return a
717
718    size = 1 << 20
719
720    a = make_nan_contaminated_vector(size)
721    a_gpu = cl_array.to_device(queue, a)
722    b = make_nan_contaminated_vector(size)
723    b_gpu = cl_array.to_device(queue, b)
724
725    ab = a * b
726    ab_gpu = (a_gpu * b_gpu).get()
727
728    assert (np.isnan(ab) == np.isnan(ab_gpu)).all()
729
730
731def test_mem_pool_with_arrays(ctx_factory):
732    context = ctx_factory()
733    queue = cl.CommandQueue(context)
734    mem_pool = cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue))
735
736    a_dev = cl_array.arange(queue, 2000, dtype=np.float32, allocator=mem_pool)
737    b_dev = cl_array.to_device(queue, np.arange(2000), allocator=mem_pool) + 4000
738
739    assert a_dev.allocator is mem_pool
740    assert b_dev.allocator is mem_pool
741
742
743def test_view(ctx_factory):
744    context = ctx_factory()
745    queue = cl.CommandQueue(context)
746
747    a = np.arange(128).reshape(8, 16).astype(np.float32)
748    a_dev = cl_array.to_device(queue, a)
749
750    # same dtype
751    view = a_dev.view()
752    assert view.shape == a_dev.shape and view.dtype == a_dev.dtype
753
754    # larger dtype
755    view = a_dev.view(np.complex64)
756    assert view.shape == (8, 8) and view.dtype == np.complex64
757
758    # smaller dtype
759    view = a_dev.view(np.int16)
760    assert view.shape == (8, 32) and view.dtype == np.int16
761
762
763def test_diff(ctx_factory):
764    context = ctx_factory()
765    queue = cl.CommandQueue(context)
766
767    from pyopencl.clrandom import rand as clrand
768
769    ary_len = 20000
770    a_dev = clrand(queue, (ary_len,), dtype=np.float32)
771    a = a_dev.get()
772
773    err = la.norm(
774            (cl.array.diff(a_dev).get() - np.diff(a)))
775    assert err < 1e-4
776
777
778def test_copy(ctx_factory):
779    context = ctx_factory()
780    queue1 = cl.CommandQueue(context)
781    queue2 = cl.CommandQueue(context)
782
783    # Test copy
784
785    arr = cl.array.zeros(queue1, 100, np.int32)
786    arr_copy = arr.copy()
787
788    assert (arr == arr_copy).all().get()
789    assert arr.data != arr_copy.data
790    assert arr_copy.queue is queue1
791
792    # Test queue association
793
794    arr_copy = arr.copy(queue=queue2)
795    assert arr_copy.queue is queue2
796
797    arr_copy = arr.copy(queue=None)
798    assert arr_copy.queue is None
799
800    arr_copy = arr.with_queue(None).copy(queue=queue1)
801    assert arr_copy.queue is queue1
802
803# }}}
804
805
806# {{{ slices, concatenation
807
808def test_slice(ctx_factory):
809    if _PYPY:
810        pytest.xfail("numpypy: spurious as_strided failure")
811
812    context = ctx_factory()
813    queue = cl.CommandQueue(context)
814
815    from pyopencl.clrandom import rand as clrand
816
817    tp = np.float32
818
819    ary_len = 20000
820    a_gpu = clrand(queue, (ary_len,), dtype=tp)
821    b_gpu = clrand(queue, (ary_len,), dtype=tp)
822    a = a_gpu.get()
823    b = b_gpu.get()
824
825    from random import randrange
826    for i in range(20):
827        start = randrange(ary_len)
828        end = randrange(start, ary_len)
829
830        a_gpu_slice = tp(2)*a_gpu[start:end]
831        a_slice = tp(2)*a[start:end]
832
833        assert la.norm(a_gpu_slice.get() - a_slice) == 0
834
835    for i in range(20):
836        start = randrange(ary_len)
837        end = randrange(start, ary_len)
838
839        a_gpu[start:end] = tp(2)*b[start:end]
840        a[start:end] = tp(2)*b[start:end]
841
842        assert la.norm(a_gpu.get() - a) == 0
843
844    for i in range(20):
845        start = randrange(ary_len)
846        end = randrange(start, ary_len)
847
848        a_gpu[start:end] = tp(2)*b_gpu[start:end]
849        a[start:end] = tp(2)*b[start:end]
850
851        assert la.norm(a_gpu.get() - a) == 0
852
853
854def test_concatenate(ctx_factory):
855    context = ctx_factory()
856    queue = cl.CommandQueue(context)
857
858    from pyopencl.clrandom import rand as clrand
859
860    a_dev = clrand(queue, (5, 15, 20), dtype=np.float32)
861    b_dev = clrand(queue, (4, 15, 20), dtype=np.float32)
862    c_dev = clrand(queue, (3, 15, 20), dtype=np.float32)
863    a = a_dev.get()
864    b = b_dev.get()
865    c = c_dev.get()
866
867    cat_dev = cl.array.concatenate((a_dev, b_dev, c_dev))
868    cat = np.concatenate((a, b, c))
869
870    assert la.norm(cat - cat_dev.get()) == 0
871
872# }}}
873
874
875# {{{ conditionals, any, all
876
877def test_comparisons(ctx_factory):
878    context = ctx_factory()
879    queue = cl.CommandQueue(context)
880
881    from pyopencl.clrandom import rand as clrand
882
883    ary_len = 20000
884    a_dev = clrand(queue, (ary_len,), dtype=np.float32)
885    b_dev = clrand(queue, (ary_len,), dtype=np.float32)
886
887    a = a_dev.get()
888    b = b_dev.get()
889
890    import operator as o
891    for op in [o.eq, o.ne, o.le, o.lt, o.ge, o.gt]:
892        res_dev = op(a_dev, b_dev)
893        res = op(a, b)
894
895        assert (res_dev.get() == res).all()
896
897        res_dev = op(a_dev, 0)
898        res = op(a, 0)
899
900        assert (res_dev.get() == res).all()
901
902        res_dev = op(0, b_dev)
903        res = op(0, b)
904
905        assert (res_dev.get() == res).all()
906
907
908def test_any_all(ctx_factory):
909    context = ctx_factory()
910    queue = cl.CommandQueue(context)
911
912    ary_len = 20000
913    a_dev = cl_array.zeros(queue, (ary_len,), dtype=np.int8)
914
915    assert not a_dev.all().get()
916    assert not a_dev.any().get()
917
918    a_dev[15213] = 1
919
920    assert not a_dev.all().get()
921    assert a_dev.any().get()
922
923    a_dev.fill(1)
924
925    assert a_dev.all().get()
926    assert a_dev.any().get()
927
928# }}}
929
930
931def test_map_to_host(ctx_factory):
932    if _PYPY:
933        pytest.skip("numpypy: no array creation from __array_interface__")
934
935    context = ctx_factory()
936    queue = cl.CommandQueue(context)
937
938    if context.devices[0].type & cl.device_type.GPU:
939        mf = cl.mem_flags
940        allocator = cl_tools.DeferredAllocator(
941                context, mf.READ_WRITE | mf.ALLOC_HOST_PTR)
942    else:
943        allocator = None
944
945    a_dev = cl_array.zeros(queue, (5, 6, 7,), dtype=np.float32, allocator=allocator)
946    a_dev[3, 2, 1] = 10
947    a_host = a_dev.map_to_host()
948    a_host[1, 2, 3] = 10
949
950    a_host_saved = a_host.copy()
951    a_host.base.release(queue)
952
953    a_dev.finish()
954
955    print("DEV[HOST_WRITE]", a_dev.get()[1, 2, 3])
956    print("HOST[DEV_WRITE]", a_host_saved[3, 2, 1])
957
958    assert (a_host_saved == a_dev.get()).all()
959
960
961def test_view_and_strides(ctx_factory):
962    if _PYPY:
963        pytest.xfail("numpypy: no array creation from __array_interface__")
964    return
965
966    context = ctx_factory()
967    queue = cl.CommandQueue(context)
968
969    from pyopencl.clrandom import rand as clrand
970
971    x = clrand(queue, (5, 10), dtype=np.float32)
972    y = x[:3, :5]
973    yv = y.view()
974
975    assert yv.shape == y.shape
976    assert yv.strides == y.strides
977
978    with pytest.raises(AssertionError):
979        assert (yv.get() == x.get()[:3, :5]).all()
980
981
982def test_meshmode_view(ctx_factory):
983    if _PYPY:
984        # https://bitbucket.org/pypy/numpy/issue/28/indexerror-on-ellipsis-slice
985        pytest.xfail("numpypy bug #28")
986
987    context = ctx_factory()
988    queue = cl.CommandQueue(context)
989
990    n = 2
991    result = cl.array.empty(queue, (2, n*6), np.float32)
992
993    def view(z):
994        return z[..., n*3:n*6].reshape(z.shape[:-1] + (n, 3))
995
996    result = result.with_queue(queue)
997    result.fill(0)
998    view(result)[0].fill(1)
999    view(result)[1].fill(1)
1000    x = result.get()
1001    assert (view(x) == 1).all()
1002
1003
1004def test_event_management(ctx_factory):
1005    context = ctx_factory()
1006    queue = cl.CommandQueue(context)
1007
1008    from pyopencl.clrandom import rand as clrand
1009
1010    x = clrand(queue, (5, 10), dtype=np.float32)
1011    assert len(x.events) == 1, len(x._events)
1012
1013    x.finish()
1014
1015    assert len(x.events) == 0
1016
1017    y = x+x
1018    assert len(y.events) == 1
1019    y = x*x
1020    assert len(y.events) == 1
1021    y = 2*x
1022    assert len(y.events) == 1
1023    y = 2/x
1024    assert len(y.events) == 1
1025    y = x/2
1026    assert len(y.events) == 1
1027    y = x**2
1028    assert len(y.events) == 1
1029    y = 2**x
1030    assert len(y.events) == 1
1031
1032    for i in range(10):
1033        x.fill(0)
1034
1035    assert len(x.events) == 10
1036
1037    for i in range(1000):
1038        x.fill(0)
1039
1040    assert len(x.events) < 100
1041
1042
1043def test_reshape(ctx_factory):
1044    context = ctx_factory()
1045    queue = cl.CommandQueue(context)
1046
1047    a = np.arange(128).reshape(8, 16).astype(np.float32)
1048    a_dev = cl_array.to_device(queue, a)
1049
1050    # different ways to specify the shape
1051    a_dev.reshape(4, 32)
1052    a_dev.reshape((4, 32))
1053    a_dev.reshape([4, 32])
1054
1055    # using -1 as unknown dimension
1056    assert a_dev.reshape(-1, 32).shape == (4, 32)
1057    assert a_dev.reshape((32, -1)).shape == (32, 4)
1058    assert a_dev.reshape(((8, -1, 4))).shape == (8, 4, 4)
1059
1060    import pytest
1061    with pytest.raises(ValueError):
1062        a_dev.reshape(-1, -1, 4)
1063
1064
1065def test_skip_slicing(ctx_factory):
1066    context = ctx_factory()
1067    queue = cl.CommandQueue(context)
1068
1069    a_host = np.arange(16).reshape((4, 4))
1070    b_host = a_host[::3]
1071
1072    a = cl_array.to_device(queue, a_host)
1073    b = a[::3]
1074    assert b.shape == b_host.shape
1075    assert np.array_equal(b[1].get(), b_host[1])
1076
1077
1078def test_transpose(ctx_factory):
1079    if _PYPY:
1080        pytest.xfail("numpypy: no array creation from __array_interface__")
1081
1082    context = ctx_factory()
1083    queue = cl.CommandQueue(context)
1084
1085    from pyopencl.clrandom import rand as clrand
1086
1087    a_gpu = clrand(queue, (10, 20, 30), dtype=np.float32)
1088    a = a_gpu.get()
1089
1090    # FIXME: not contiguous
1091    #assert np.allclose(a_gpu.transpose((1,2,0)).get(), a.transpose((1,2,0)))
1092    assert np.array_equal(a_gpu.T.get(), a.T)
1093
1094
1095def test_newaxis(ctx_factory):
1096    context = ctx_factory()
1097    queue = cl.CommandQueue(context)
1098
1099    from pyopencl.clrandom import rand as clrand
1100
1101    a_gpu = clrand(queue, (10, 20, 30), dtype=np.float32)
1102    a = a_gpu.get()
1103
1104    b_gpu = a_gpu[:, np.newaxis]
1105    b = a[:, np.newaxis]
1106
1107    assert b_gpu.shape == b.shape
1108    for i in range(b.ndim):
1109        if b.shape[i] > 1:
1110            assert b_gpu.strides[i] == b.strides[i]
1111
1112
1113def test_squeeze(ctx_factory):
1114    context = ctx_factory()
1115    queue = cl.CommandQueue(context)
1116
1117    shape = (40, 2, 5, 100)
1118    a_cpu = np.random.random(size=shape)
1119    a_gpu = cl_array.to_device(queue, a_cpu)
1120
1121    # Slice with length 1 on dimensions 0 and 1
1122    a_gpu_slice = a_gpu[0:1, 1:2, :, :]
1123    assert a_gpu_slice.shape == (1, 1, shape[2], shape[3])
1124    assert a_gpu_slice.flags.c_contiguous
1125
1126    # Squeeze it and obtain contiguity
1127    a_gpu_squeezed_slice = a_gpu[0:1, 1:2, :, :].squeeze()
1128    assert a_gpu_squeezed_slice.shape == (shape[2], shape[3])
1129    assert a_gpu_squeezed_slice.flags.c_contiguous
1130
1131    # Check that we get the original values out
1132    #assert np.all(a_gpu_slice.get().ravel() == a_gpu_squeezed_slice.get().ravel())
1133
1134    # Slice with length 1 on dimensions 2
1135    a_gpu_slice = a_gpu[:, :, 2:3, :]
1136    assert a_gpu_slice.shape == (shape[0], shape[1], 1, shape[3])
1137    assert not a_gpu_slice.flags.c_contiguous
1138
1139    # Squeeze it, but no contiguity here
1140    a_gpu_squeezed_slice = a_gpu[:, :, 2:3, :].squeeze()
1141    assert a_gpu_squeezed_slice.shape == (shape[0], shape[1], shape[3])
1142    assert not a_gpu_squeezed_slice.flags.c_contiguous
1143
1144    # Check that we get the original values out
1145    #assert np.all(a_gpu_slice.get().ravel() == a_gpu_squeezed_slice.get().ravel())
1146
1147
1148def test_fancy_fill(ctx_factory):
1149    if _PYPY:
1150        pytest.xfail("numpypy: multi value setting is not supported")
1151    context = ctx_factory()
1152    queue = cl.CommandQueue(context)
1153
1154    numpy_dest = np.zeros((4,), np.int32)
1155    numpy_idx = np.arange(3, dtype=np.int32)
1156    numpy_src = np.arange(8, 9, dtype=np.int32)
1157    numpy_dest[numpy_idx] = numpy_src
1158
1159    cl_dest = cl_array.zeros(queue, (4,), np.int32)
1160    cl_idx = cl_array.arange(queue, 3, dtype=np.int32)
1161    cl_src = cl_array.arange(queue, 8, 9, dtype=np.int32)
1162    cl_dest[cl_idx] = cl_src
1163
1164    assert np.all(numpy_dest == cl_dest.get())
1165
1166
1167def test_fancy_indexing(ctx_factory):
1168    if _PYPY:
1169        pytest.xfail("numpypy: multi value setting is not supported")
1170    context = ctx_factory()
1171    queue = cl.CommandQueue(context)
1172
1173    numpy_dest = np.zeros((4,), np.int32)
1174    numpy_idx = np.arange(3, 0, -1, dtype=np.int32)
1175    numpy_src = np.arange(8, 11, dtype=np.int32)
1176    numpy_dest[numpy_idx] = numpy_src
1177
1178    cl_dest = cl_array.zeros(queue, (4,), np.int32)
1179    cl_idx = cl_array.arange(queue, 3, 0, -1, dtype=np.int32)
1180    cl_src = cl_array.arange(queue, 8, 11, dtype=np.int32)
1181    cl_dest[cl_idx] = cl_src
1182
1183    assert np.all(numpy_dest == cl_dest.get())
1184
1185    cl_idx[1] = 3
1186    cl_idx[2] = 2
1187
1188    numpy_idx[1] = 3
1189    numpy_idx[2] = 2
1190
1191    numpy_dest[numpy_idx] = numpy_src
1192    cl_dest[cl_idx] = cl_src
1193
1194    assert np.all(numpy_dest == cl_dest.get())
1195
1196
1197def test_multi_put(ctx_factory):
1198    if _PYPY:
1199        pytest.xfail("numpypy: multi value setting is not supported")
1200
1201    context = ctx_factory()
1202    queue = cl.CommandQueue(context)
1203
1204    cl_arrays = [
1205        cl_array.arange(queue, 0, 3, dtype=np.float32)
1206        for i in range(1, 10)
1207    ]
1208    idx = cl_array.arange(queue, 0, 6, dtype=np.int32)
1209    out_arrays = [
1210        cl_array.zeros(queue, (10,), np.float32)
1211        for i in range(9)
1212    ]
1213
1214    out_compare = [np.zeros((10,), np.float32) for i in range(9)]
1215    for i, ary in enumerate(out_compare):
1216        ary[idx.get()] = np.arange(0, 6, dtype=np.float32)
1217
1218    cl_array.multi_put(cl_arrays, idx, out=out_arrays)
1219
1220    assert np.all(np.all(out_compare[i] == out_arrays[i].get()) for i in range(9))
1221
1222
1223def test_outoforderqueue_get(ctx_factory):
1224    context = ctx_factory()
1225    try:
1226        queue = cl.CommandQueue(context,
1227               properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
1228    except Exception:
1229        pytest.skip("out-of-order queue not available")
1230    a = np.random.rand(10**6).astype(np.dtype('float32'))
1231    a_gpu = cl_array.to_device(queue, a)
1232    b_gpu = a_gpu + a_gpu**5 + 1
1233    b1 = b_gpu.get()  # testing that this waits for events
1234    b = a + a**5 + 1
1235    assert np.abs(b1 - b).mean() < 1e-5
1236
1237
1238def test_outoforderqueue_copy(ctx_factory):
1239    context = ctx_factory()
1240    try:
1241        queue = cl.CommandQueue(context,
1242               properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
1243    except Exception:
1244        pytest.skip("out-of-order queue not available")
1245    a = np.random.rand(10**6).astype(np.dtype('float32'))
1246    a_gpu = cl_array.to_device(queue, a)
1247    c_gpu = a_gpu**2 - 7
1248    b_gpu = c_gpu.copy()  # testing that this waits for and creates events
1249    b_gpu *= 10
1250    queue.finish()
1251    b1 = b_gpu.get()
1252    b = 10 * (a**2 - 7)
1253    assert np.abs(b1 - b).mean() < 1e-5
1254
1255
1256def test_outoforderqueue_indexing(ctx_factory):
1257    context = ctx_factory()
1258    try:
1259        queue = cl.CommandQueue(context,
1260               properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
1261    except Exception:
1262        pytest.skip("out-of-order queue not available")
1263    a = np.random.rand(10**6).astype(np.dtype('float32'))
1264    i = (8e5 + 1e5 * np.random.rand(10**5)).astype(np.dtype('int32'))
1265    a_gpu = cl_array.to_device(queue, a)
1266    i_gpu = cl_array.to_device(queue, i)
1267    c_gpu = (a_gpu**2)[i_gpu - 10000]
1268    b_gpu = 10 - a_gpu
1269    b_gpu[:] = 8 * a_gpu
1270    b_gpu[i_gpu + 10000] = c_gpu - 10
1271    queue.finish()
1272    b1 = b_gpu.get()
1273    c = (a**2)[i - 10000]
1274    b = 8 * a
1275    b[i + 10000] = c - 10
1276    assert np.abs(b1 - b).mean() < 1e-5
1277
1278
1279def test_outoforderqueue_reductions(ctx_factory):
1280    context = ctx_factory()
1281    try:
1282        queue = cl.CommandQueue(context,
1283               properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
1284    except Exception:
1285        pytest.skip("out-of-order queue not available")
1286    # 0/1 values to avoid accumulated rounding error
1287    a = (np.random.rand(10**6) > 0.5).astype(np.dtype('float32'))
1288    a[800000] = 10  # all<5 looks true until near the end
1289    a_gpu = cl_array.to_device(queue, a)
1290    b1 = cl_array.sum(a_gpu).get()
1291    b2 = cl_array.dot(a_gpu, 3 - a_gpu).get()
1292    b3 = (a_gpu < 5).all().get()
1293    assert b1 == a.sum() and b2 == a.dot(3 - a) and b3 == 0
1294
1295
1296if __name__ == "__main__":
1297    # make sure that import failures get reported, instead of skipping the
1298    # tests.
1299    if len(sys.argv) > 1:
1300        exec(sys.argv[1])
1301    else:
1302        from pytest import main
1303        main([__file__])
1304
1305# vim: filetype=pyopencl:fdm=marker
1306