#! /usr/bin/env python

from __future__ import division, with_statement, absolute_import, print_function

__copyright__ = "Copyright (C) 2013 Andreas Kloeckner"

__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""

from six.moves import range, zip
import numpy as np
import numpy.linalg as la
import sys
from pytools import memoize
from test_array import general_clrand

import pytest

import pyopencl as cl
import pyopencl.array as cl_array  # noqa
from pyopencl.tools import (  # noqa
        pytest_generate_tests_for_pyopencl as pytest_generate_tests)
from pyopencl.characterize import has_double_support, has_struct_arg_count_bug
from pyopencl.scan import (InclusiveScanKernel, ExclusiveScanKernel,
        GenericScanKernel, GenericDebugScanKernel)
from pyopencl.characterize import get_pocl_version


# {{{ elementwise

def test_elwise_kernel(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    a_gpu = clrand(queue, (50,), np.float32)
    b_gpu = clrand(queue, (50,), np.float32)

    from pyopencl.elementwise import ElementwiseKernel
    lin_comb = ElementwiseKernel(context,
            "float a, float *x, float b, float *y, float *z",
            "z[i] = a*x[i] + b*y[i]",
            "linear_combination")

    c_gpu = cl_array.empty_like(a_gpu)
    lin_comb(5, a_gpu, 6, b_gpu, c_gpu)

    assert la.norm((c_gpu - (5 * a_gpu + 6 * b_gpu)).get()) < 1e-5


def test_elwise_kernel_with_options(ctx_factory):
    from pyopencl.clrandom import rand as clrand
    from pyopencl.elementwise import ElementwiseKernel

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    in_gpu = clrand(queue, (50,), np.float32)

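    # Build with -D ADD_ONE so that the #ifdef branch in the kernel body
    # below is compiled in; without the macro the kernel would just copy
    # its input.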
    options = ['-D', 'ADD_ONE']
    add_one = ElementwiseKernel(
        context,
        "float* out, const float *in",
        """
        out[i] = in[i]
        #ifdef ADD_ONE
            +1
        #endif
        ;
        """,
        options=options,
        )

    out_gpu = cl_array.empty_like(in_gpu)
    add_one(out_gpu, in_gpu)

    gt = in_gpu.get() + 1
    gv = out_gpu.get()
    assert la.norm(gv - gt) < 1e-5


def test_ranged_elwise_kernel(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.elementwise import ElementwiseKernel
    set_to_seven = ElementwiseKernel(context,
            "float *z", "z[i] = 7", "set_to_seven")

    for i, slc in enumerate([
            slice(5, 20000),
            slice(5, 20000, 17),
            slice(3000, 5, -1),
            slice(1000, -1),
            ]):

        a_gpu = cl_array.zeros(queue, (50000,), dtype=np.float32)
        a_cpu = np.zeros(a_gpu.shape, a_gpu.dtype)

        a_cpu[slc] = 7
        set_to_seven(a_gpu, slice=slc)

        assert (a_cpu == a_gpu.get()).all()


def test_take(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    idx = cl_array.arange(queue, 0, 200000, 2, dtype=np.uint32)
    a = cl_array.arange(queue, 0, 600000, 3, dtype=np.float32)
    result = cl_array.take(a, idx)
    assert ((3 * idx).get() == result.get()).all()


def test_arange(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    n = 5000
    a = cl_array.arange(queue, n, dtype=np.float32)
    assert (np.arange(n, dtype=np.float32) == a.get()).all()


def test_reverse(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    n = 5000
    a = np.arange(n).astype(np.float32)
    a_gpu = cl_array.to_device(queue, a)

    a_gpu = a_gpu.reverse()

    assert (a[::-1] == a_gpu.get()).all()


def test_if_positive(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    ary_len = 20000
    a_gpu = clrand(queue, (ary_len,), np.float32)
    b_gpu = clrand(queue, (ary_len,), np.float32)
    a = a_gpu.get()
    b = b_gpu.get()

    max_a_b_gpu = cl_array.maximum(a_gpu, b_gpu)
    min_a_b_gpu = cl_array.minimum(a_gpu, b_gpu)

    print(max_a_b_gpu)
    print(np.maximum(a, b))

    assert la.norm(max_a_b_gpu.get() - np.maximum(a, b)) == 0
    assert la.norm(min_a_b_gpu.get() - np.minimum(a, b)) == 0


def test_take_put(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    for n in [5, 17, 333]:
        one_field_size = 8
        buf_gpu = cl_array.zeros(queue,
                n * one_field_size, dtype=np.float32)
        dest_indices = cl_array.to_device(queue,
                np.array([0, 1, 2,  3, 32, 33, 34, 35], dtype=np.uint32))
        read_map = cl_array.to_device(queue,
                np.array([7, 6, 5, 4, 3, 2, 1, 0], dtype=np.uint32))

        cl_array.multi_take_put(
                arrays=[buf_gpu for i in range(n)],
                dest_indices=dest_indices,
                src_indices=read_map,
                src_offsets=[i * one_field_size for i in range(n)],
                dest_shape=(96,))


def test_astype(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    if not has_double_support(context.devices[0]):
        from pytest import skip
        skip("double precision not supported on %s" % context.devices[0])

    a_gpu = clrand(queue, (2000,), dtype=np.float32)

    a = a_gpu.get().astype(np.float64)
    a2 = a_gpu.astype(np.float64).get()

    assert a2.dtype == np.float64
    assert la.norm(a - a2) == 0, (a, a2)

    a_gpu = clrand(queue, (2000,), dtype=np.float64)

    a = a_gpu.get().astype(np.float32)
    a2 = a_gpu.astype(np.float32).get()

    assert a2.dtype == np.float32
    assert la.norm(a - a2) / la.norm(a) < 1e-7

# }}}


# {{{ reduction

def test_sum(ctx_factory):
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    n = 200000
    for dtype in [np.float32, np.complex64]:
        a_gpu = general_clrand(queue, (n,), dtype)

        a = a_gpu.get()

        for slc in [
                slice(None),
                slice(1000, 3000),
                slice(1000, -3000),
                slice(1000, None),
                slice(1000, None, 3),
                ]:
            sum_a = np.sum(a[slc])

            if slc.step is None:
                sum_a_gpu = cl_array.sum(a_gpu[slc]).get()
                assert abs(sum_a_gpu - sum_a) / abs(sum_a) < 1e-4

            sum_a_gpu_2 = cl_array.sum(a_gpu, slice=slc).get()
            assert abs(sum_a_gpu_2 - sum_a) / abs(sum_a) < 1e-4


def test_sum_without_data(ctx_factory):
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    n = 2000

    from pyopencl.reduction import ReductionKernel
    red = ReductionKernel(context, np.int32,
            neutral="0",
            reduce_expr="a+b", map_expr="i",
            arguments=[])

    result_dev = red(range=slice(n), queue=queue).get()
    result_ref = n*(n-1)//2

    assert result_dev == result_ref


def test_minmax(ctx_factory):
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    if has_double_support(context.devices[0]):
        dtypes = [np.float64, np.float32, np.int32]
    else:
        dtypes = [np.float32, np.int32]

    for what in ["min", "max"]:
        for dtype in dtypes:
            a_gpu = clrand(queue, (200000,), dtype)
            a = a_gpu.get()

            op_a = getattr(np, what)(a)
            op_a_gpu = getattr(cl_array, what)(a_gpu).get()

            assert op_a_gpu == op_a, (op_a_gpu, op_a, dtype, what)


def test_subset_minmax(ctx_factory):
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    l_a = 200000
    gran = 5
    l_m = l_a - l_a // gran + 1

    if has_double_support(context.devices[0]):
        dtypes = [np.float64, np.float32, np.int32]
    else:
        dtypes = [np.float32, np.int32]

    for dtype in dtypes:
        a_gpu = clrand(queue, (l_a,), dtype)
        a = a_gpu.get()

        meaningful_indices_gpu = cl_array.zeros(
                queue, l_m, dtype=np.int32)
        meaningful_indices = meaningful_indices_gpu.get()
        j = 0
        for i in range(len(meaningful_indices)):
            meaningful_indices[i] = j
            j = j + 1
            if j % gran == 0:
                j = j + 1

        meaningful_indices_gpu = cl_array.to_device(
                queue, meaningful_indices)
        b = a[meaningful_indices]

        min_a = np.min(b)
        min_a_gpu = cl_array.subset_min(meaningful_indices_gpu, a_gpu).get()

        assert min_a_gpu == min_a


def test_dot(ctx_factory):
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dev = context.devices[0]

    dtypes = [np.float32, np.complex64]
    if has_double_support(dev):
        if has_struct_arg_count_bug(dev) == "apple":
            dtypes.extend([np.float64])
        else:
            dtypes.extend([np.float64, np.complex128])

    for a_dtype in dtypes:
        for b_dtype in dtypes:
            print(a_dtype, b_dtype)
            a_gpu = general_clrand(queue, (200000,), a_dtype)
            a = a_gpu.get()
            b_gpu = general_clrand(queue, (200000,), b_dtype)
            b = b_gpu.get()

            dot_ab = np.dot(a, b)
            dot_ab_gpu = cl_array.dot(a_gpu, b_gpu).get()

            assert abs(dot_ab_gpu - dot_ab) / abs(dot_ab) < 1e-4

            try:
                vdot_ab = np.vdot(a, b)
            except NotImplementedError:
                import sys
                is_pypy = '__pypy__' in sys.builtin_module_names
                if is_pypy:
                    print("PYPY: VDOT UNIMPLEMENTED")
                    continue
                else:
                    raise

            vdot_ab_gpu = cl_array.vdot(a_gpu, b_gpu).get()

            rel_err = abs(vdot_ab_gpu - vdot_ab) / abs(vdot_ab)
            assert rel_err < 1e-4, rel_err


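# The struct-valued reduction below needs a numpy dtype whose layout matches
# the device-side C struct. make_mmc_dtype builds that dtype, matches it to
# the device's struct layout and registers it with PyOpenCL under the name
# "minmax_collector"; memoization keeps the registration to once per device.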
@memoize
def make_mmc_dtype(device):
    dtype = np.dtype([
        ("cur_min", np.int32),
        ("cur_max", np.int32),
        ("pad", np.int32),
        ])

    name = "minmax_collector"
    from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct

    dtype, c_decl = match_dtype_to_c_struct(device, name, dtype)
    dtype = get_or_register_dtype(name, dtype)

    return dtype, c_decl


def test_struct_reduce(ctx_factory):
    pytest.importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dev, = context.devices
    if (dev.vendor == "NVIDIA" and dev.platform.vendor == "Apple"
            and dev.driver_version == "8.12.47 310.40.00.05f01"):
        pytest.skip("causes a compiler hang on Apple/Nv GPU")

    mmc_dtype, mmc_c_decl = make_mmc_dtype(context.devices[0])

    preamble = mmc_c_decl + r"""//CL//

    minmax_collector mmc_neutral()
    {
        // FIXME: needs infinity literal in real use, ok here
        minmax_collector result;
        result.cur_min = 1<<30;
        result.cur_max = -(1<<30);
        return result;
    }

    minmax_collector mmc_from_scalar(float x)
    {
        minmax_collector result;
        result.cur_min = x;
        result.cur_max = x;
        return result;
    }

    minmax_collector agg_mmc(minmax_collector a, minmax_collector b)
    {
        minmax_collector result = a;
        if (b.cur_min < result.cur_min)
            result.cur_min = b.cur_min;
        if (b.cur_max > result.cur_max)
            result.cur_max = b.cur_max;
        return result;
    }

    """

    from pyopencl.clrandom import rand as clrand
    a_gpu = clrand(queue, (20000,), dtype=np.int32, a=0, b=10**6)
    a = a_gpu.get()

    from pyopencl.reduction import ReductionKernel
    red = ReductionKernel(context, mmc_dtype,
            neutral="mmc_neutral()",
            reduce_expr="agg_mmc(a, b)", map_expr="mmc_from_scalar(x[i])",
            arguments="__global int *x", preamble=preamble)

    minmax = red(a_gpu).get()
    #print minmax["cur_min"], minmax["cur_max"]
    #print np.min(a), np.max(a)

    assert abs(minmax["cur_min"] - np.min(a)) < 1e-5
    assert abs(minmax["cur_max"] - np.max(a)) < 1e-5

# }}}


# {{{ scan-related

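# Helper for the scan tests below: produces a compact, human-readable summary
# of where `obtained` deviates from `desired` by more than `thresh`,
# collapsing runs of matching entries into "<N ok>" markers and capping the
# number of mismatches that are spelled out.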
def summarize_error(obtained, desired, orig, thresh=1e-5):
    from pytest import importorskip
    importorskip("mako")

    err = obtained - desired
    ok_count = 0
    bad_count = 0

    bad_limit = 200

    def summarize_counts():
        if ok_count:
            entries.append("<%d ok>" % ok_count)
        if bad_count >= bad_limit:
            entries.append("<%d more bad>" % (bad_count-bad_limit))

    entries = []
    for i, val in enumerate(err):
        if abs(val) > thresh:
            if ok_count:
                summarize_counts()
                ok_count = 0

            bad_count += 1

            if bad_count < bad_limit:
                entries.append("%r (want: %r, got: %r, orig: %r)" % (
                    val, desired[i], obtained[i], orig[i]))
        else:
            if bad_count:
                summarize_counts()
                bad_count = 0

            ok_count += 1

    summarize_counts()

    return " ".join(entries)


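# Array sizes used by the scan/sort tests below. They straddle power-of-two
# boundaries (2**8, 2**10, 2**12, 2**20, ...) so that edge cases in how the
# kernels split work into blocks get exercised.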
scan_test_counts = [
    10,
    2 ** 8 - 1,
    2 ** 8,
    2 ** 8 + 1,
    2 ** 10 - 5,
    2 ** 10,
    2 ** 10 + 5,
    2 ** 12 - 5,
    2 ** 12,
    2 ** 12 + 5,
    2 ** 20 - 2 ** 18,
    2 ** 20 - 2 ** 18 + 5,
    2 ** 20 + 1,
    2 ** 20,
    2 ** 23 + 3,
    # larger sizes cause out of memory on low-end AMD APUs
    ]


@pytest.mark.parametrize("dtype", [np.int32, np.int64])
@pytest.mark.parametrize("scan_cls", [InclusiveScanKernel, ExclusiveScanKernel])
def test_scan(ctx_factory, dtype, scan_cls):
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    knl = scan_cls(context, dtype, "a+b", "0")

    for n in scan_test_counts:
        host_data = np.random.randint(0, 10, n).astype(dtype)
        dev_data = cl_array.to_device(queue, host_data)

        # /!\ fails on Nv GT2?? for some drivers
        assert (host_data == dev_data.get()).all()

        knl(dev_data)

        desired_result = np.cumsum(host_data, axis=0)
        if scan_cls is ExclusiveScanKernel:
            desired_result -= host_data

        is_ok = (dev_data.get() == desired_result).all()
        if not is_ok:
            print("something went wrong, summarizing error...")
            print(summarize_error(dev_data.get(), desired_result, host_data))

        print("dtype:%s n:%d %s worked:%s" % (dtype, n, scan_cls, is_ok))
        assert is_ok
        from gc import collect
        collect()


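# With all segment flags zero (as set up below), the segmented scan
# degenerates to a plain cumulative sum; the point of this test is mainly
# that VectorArg arguments with with_offset=True round-trip correctly.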
@pytest.mark.parametrize("scan_cls", (GenericScanKernel, GenericDebugScanKernel))
def test_scan_with_vectorargs_with_offsets(ctx_factory, scan_cls):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.tools import VectorArg

    knl = scan_cls(
            context, float,
            arguments=[
                VectorArg(float, "input", with_offset=True),
                VectorArg(int, "segment", with_offset=True),
                ],
            input_expr="input[i]",
            is_segment_start_expr="segment[i]",
            scan_expr="a+b", neutral="0",
            output_statement="""
                input[i] = item;
                """)

    n = 20

    host_data = np.random.randint(0, 10, n).astype(float)
    dev_data = cl_array.to_device(queue, host_data)
    segment_data = np.zeros(n, dtype=int)
    dev_segment_data = cl_array.to_device(queue, segment_data)

    knl(dev_data, dev_segment_data)

    assert (dev_data.get() == np.cumsum(host_data)).all()


def test_copy_if(ctx_factory):
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand
    for n in scan_test_counts:
        a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000)
        a = a_dev.get()

        from pyopencl.algorithm import copy_if

        crit = a_dev.dtype.type(300)
        selected = a[a > crit]
        selected_dev, count_dev, evt = copy_if(
                a_dev, "ary[i] > myval", [("myval", crit)])

        assert (selected_dev.get()[:count_dev.get()] == selected).all()
        from gc import collect
        collect()


def test_partition(ctx_factory):
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand
    for n in scan_test_counts:
        print("part", n)

        a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000)
        a = a_dev.get()

        crit = a_dev.dtype.type(300)
        true_host = a[a > crit]
        false_host = a[a <= crit]

        from pyopencl.algorithm import partition
        true_dev, false_dev, count_true_dev, evt = partition(
                a_dev, "ary[i] > myval", [("myval", crit)])

        count_true_dev = count_true_dev.get()

        assert (true_dev.get()[:count_true_dev] == true_host).all()
        assert (false_dev.get()[:n-count_true_dev] == false_host).all()


def test_unique(ctx_factory):
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand
    for n in scan_test_counts:
        a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000)
        a = a_dev.get()
        a = np.sort(a)
        a_dev = cl_array.to_device(queue, a)

        a_unique_host = np.unique(a)

        from pyopencl.algorithm import unique
        a_unique_dev, count_unique_dev, evt = unique(a_dev)

        count_unique_dev = count_unique_dev.get()

        assert (a_unique_dev.get()[:count_unique_dev] == a_unique_host).all()
        from gc import collect
        collect()


def test_index_preservation(ctx_factory):
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    classes = [GenericScanKernel]

    dev = context.devices[0]
    if dev.type & cl.device_type.CPU:
        classes.append(GenericDebugScanKernel)

    for cls in classes:
        for n in scan_test_counts:
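            # scan_expr="b" simply keeps the most recent input, so with
            # input_expr="i" every output item should equal its own index.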
            knl = cls(
                    context, np.int32,
                    arguments="__global int *out",
                    input_expr="i",
                    scan_expr="b", neutral="0",
                    output_statement="""
                        out[i] = item;
                        """)

            out = cl_array.empty(queue, n, dtype=np.int32)
            knl(out)

            assert (out.get() == np.arange(n)).all()
            from gc import collect
            collect()


def test_segmented_scan(ctx_factory):
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.tools import dtype_to_ctype
    dtype = np.int32
    ctype = dtype_to_ctype(dtype)

    #for is_exclusive in [False, True]:
    for is_exclusive in [True, False]:
        if is_exclusive:
            output_statement = "out[i] = prev_item"
        else:
            output_statement = "out[i] = item"

        knl = GenericScanKernel(context, dtype,
                arguments="__global %s *ary, __global char *segflags, "
                    "__global %s *out" % (ctype, ctype),
                input_expr="ary[i]",
                scan_expr="across_seg_boundary ? b : (a+b)", neutral="0",
                is_segment_start_expr="segflags[i]",
                output_statement=output_statement,
                options=[])

        np.set_printoptions(threshold=2000)
        from random import randrange
        from pyopencl.clrandom import rand as clrand
        for n in scan_test_counts:
            a_dev = clrand(queue, (n,), dtype=dtype, a=0, b=10)
            a = a_dev.get()

            if 10 <= n < 20:
                seg_boundaries_values = [
                        [0, 9],
                        [0, 3],
                        [4, 6],
                        ]
            else:
                seg_boundaries_values = []
                for i in range(10):
                    seg_boundary_count = max(2, min(100, randrange(0, int(0.4*n))))
                    seg_boundaries = [
                            randrange(n) for i in range(seg_boundary_count)]
                    if n >= 1029:
                        seg_boundaries.insert(0, 1028)
                    seg_boundaries.sort()
                    seg_boundaries_values.append(seg_boundaries)

            for seg_boundaries in seg_boundaries_values:
                #print "BOUNDARIES", seg_boundaries
                #print a

                seg_boundary_flags = np.zeros(n, dtype=np.uint8)
                seg_boundary_flags[seg_boundaries] = 1
                seg_boundary_flags_dev = cl_array.to_device(
                        queue, seg_boundary_flags)

                seg_boundaries.insert(0, 0)

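                # Build the reference result on the host: an independent
                # cumulative sum within each segment, shifted by one entry
                # (with a leading zero) for the exclusive variant.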
                result_host = a.copy()
                for i, seg_start in enumerate(seg_boundaries):
                    if i+1 < len(seg_boundaries):
                        seg_end = seg_boundaries[i+1]
                    else:
                        seg_end = None

                    if is_exclusive:
                        result_host[seg_start+1:seg_end] = np.cumsum(
                                a[seg_start:seg_end][:-1])
                        result_host[seg_start] = 0
                    else:
                        result_host[seg_start:seg_end] = np.cumsum(
                                a[seg_start:seg_end])

                #print "REF", result_host

                result_dev = cl_array.empty_like(a_dev)
                knl(a_dev, seg_boundary_flags_dev, result_dev)

                #print "RES", result_dev
                is_correct = (result_dev.get() == result_host).all()
                if not is_correct:
                    diff = result_dev.get() - result_host
                    print("RES-REF", diff)
                    print("ERRWHERE", np.where(diff))
                    print(n, list(seg_boundaries))

                assert is_correct
                from gc import collect
                collect()

            print("%d excl:%s done" % (n, is_exclusive))


@pytest.mark.parametrize("scan_kernel", [GenericScanKernel, GenericDebugScanKernel])
def test_sort(ctx_factory, scan_kernel):
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dtype = np.int32

    from pyopencl.algorithm import RadixSort
    sort = RadixSort(context, "int *ary", key_expr="ary[i]",
            sort_arg_names=["ary"], scan_kernel=scan_kernel)

    from pyopencl.clrandom import RanluxGenerator
    rng = RanluxGenerator(queue, seed=15)

    from time import time

    # intermediate arrays for largest size cause out-of-memory on low-end GPUs
    for n in scan_test_counts[:-1]:
        if n >= 2000 and scan_kernel is GenericDebugScanKernel:
            continue

        print(n)

        print("  rng")
        a_dev = rng.uniform(queue, (n,), dtype=dtype, a=0, b=2**16)
        a = a_dev.get()

        dev_start = time()
        print("  device")
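        # The keys were drawn from [0, 2**16), so 16 key bits are enough
        # for the radix sort.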
        (a_dev_sorted,), evt = sort(a_dev, key_bits=16)
        queue.finish()
        dev_end = time()
        print("  numpy")
        a_sorted = np.sort(a)
        numpy_end = time()

        numpy_elapsed = numpy_end-dev_end
        dev_elapsed = dev_end-dev_start
        print("  dev: %.2f MKeys/s numpy: %.2f MKeys/s ratio: %.2fx" % (
                1e-6*n/dev_elapsed, 1e-6*n/numpy_elapsed, numpy_elapsed/dev_elapsed))
        assert (a_dev_sorted.get() == a_sorted).all()


def test_list_builder(ctx_factory):
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.algorithm import ListOfListsBuilder
    builder = ListOfListsBuilder(context, [("mylist", np.int32)], """//CL//
            void generate(LIST_ARG_DECL USER_ARG_DECL index_type i)
            {
                int count = i % 4;
                for (int j = 0; j < count; ++j)
                {
                    APPEND_mylist(count);
                }
            }
            """, arg_decls=[])

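    # Each index i appends (i % 4) copies of the value (i % 4), so 2000
    # indices yield 500 * (0 + 1 + 2 + 3) = 3000 entries in total, and the
    # last nonempty lists are [1], [2, 2] and [3, 3, 3].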
    result, evt = builder(queue, 2000)

    inf = result["mylist"]
    assert inf.count == 3000
    assert (inf.lists.get()[-6:] == [1, 2, 2, 3, 3, 3]).all()


def test_list_builder_with_empty_elim(ctx_factory):
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.algorithm import ListOfListsBuilder

    builder = ListOfListsBuilder(
        context,
        [("mylist1", np.int32), ("mylist2", np.int32), ("mylist3", np.int32)],
        """//CL//
            void generate(LIST_ARG_DECL USER_ARG_DECL index_type i)
            {
                if (i % 5 == 0)
                {
                    for (int j = 0; j < i / 5; ++j)
                    {
                        APPEND_mylist1(j);
                        APPEND_mylist2(j + 1);
                        APPEND_mylist3(j);
                    }
                }
            }
        """,
        arg_decls=[],
        eliminate_empty_output_lists=["mylist1", "mylist2"])

    result, evt = builder(queue, 1000)

    mylist1 = result["mylist1"]
    assert mylist1.count == 19900
    assert (mylist1.starts.get()[:5] == [0, 1, 3, 6, 10]).all()
    assert (mylist1.nonempty_indices.get()[:5] == [5, 10, 15, 20, 25]).all()
    assert (mylist1.lists.get()[:6] == [0, 0, 1, 0, 1, 2]).all()
    mylist2 = result["mylist2"]
    assert mylist2.count == 19900
    assert (mylist2.lists.get()[:6] == [1, 1, 2, 1, 2, 3]).all()
    mylist3 = result["mylist3"]
    assert mylist3.count == 19900
    assert (mylist3.starts.get()[:10] == [0, 0, 0, 0, 0, 0, 1, 1, 1, 1]).all()
    assert (mylist3.lists.get()[:6] == [0, 0, 1, 0, 1, 2]).all()


def test_key_value_sorter(ctx_factory):
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    n = 10**5
    nkeys = 2000
    from pyopencl.clrandom import rand as clrand
    keys = clrand(queue, n, np.int32, b=nkeys)
    values = clrand(queue, n, np.int32, b=n).astype(np.int64)

    assert np.max(keys.get()) < nkeys

    from pyopencl.algorithm import KeyValueSorter
    kvs = KeyValueSorter(context)
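    # KeyValueSorter groups the values by key into a CSR-like structure:
    # the values belonging to key i end up in lists[starts[i]:starts[i+1]],
    # which the loop at the end checks against a host-side dict.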
    starts, lists, evt = kvs(queue, keys, values, nkeys, starts_dtype=np.int32)

    starts = starts.get()
    lists = lists.get()

    mydict = dict()
    for k, v in zip(keys.get(), values.get()):
        mydict.setdefault(k, []).append(v)

    for i in range(nkeys):
        start, end = starts[i:i+2]
        assert sorted(mydict[i]) == sorted(lists[start:end])

# }}}


# {{{ bitonic sort

@pytest.mark.parametrize("size", [
    512,
    4,
    16
    ])
@pytest.mark.parametrize("dtype", [
    np.int32,
    np.float32,
    np.float64
    ])
@pytest.mark.bitonic
def test_bitonic_sort(ctx_factory, size, dtype):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    if (dev.platform.name == "Apple" and dev.type & cl.device_type.CPU):
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
            "parallelism")
    if (dev.platform.name == "Portable Computing Language"
            and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0")

    if dtype == np.float64 and not has_double_support(dev):
        from pytest import skip
        skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    s = clrandom.rand(queue, (2, size, 3,), dtype, luxury=None, a=0, b=239482333)
    sgs = s.copy()
    # enqueue_marker crashes under CL 1.1 pocl if there is anything to wait for
    # (no clEnqueueWaitForEvents) https://github.com/inducer/pyopencl/pull/237
    if (dev.platform.name == "Portable Computing Language"
            and cl.get_cl_header_version() < (1, 2)):
        sgs.finish()
    sorter = BitonicSort(ctx)
    sgs, evt = sorter(sgs, axis=1)
    assert np.array_equal(np.sort(s.get(), axis=1), sgs.get())


@pytest.mark.parametrize("size", [
    0,
    4,
    2**14,
    2**18,
    ])
@pytest.mark.parametrize("dtype", [
    np.int32,
    np.float32,
    np.float64
    ])
@pytest.mark.bitonic
def test_bitonic_argsort(ctx_factory, size, dtype):
    import sys
    is_pypy = '__pypy__' in sys.builtin_module_names

    if not size and is_pypy:
        # https://bitbucket.org/pypy/numpy/issues/53/specifying-strides-on-zero-sized-array
        pytest.xfail("pypy doesn't seem to handle as_strided "
                "on zero-sized arrays very well")

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    if (dev.platform.name == "Portable Computing Language"
            and sys.platform == "darwin"):
        pytest.xfail("Bitonic sort crashes on Apple POCL")
    if (dev.platform.name == "Apple" and dev.type & cl.device_type.CPU):
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
            "parallelism")
    if (dev.platform.name == "Portable Computing Language"
            and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0")

    if dtype == np.float64 and not has_double_support(dev):
        from pytest import skip
        skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    m = clrandom.rand(queue, (size,), dtype, luxury=None, a=0, b=239432234)

    sorterm = BitonicSort(ctx)

    ms = m.copy()
    # enqueue_marker crashes under CL 1.1 pocl if there is anything to wait for
    # (no clEnqueueWaitForEvents) https://github.com/inducer/pyopencl/pull/237
    if (dev.platform.name == "Portable Computing Language"
            and cl.get_cl_header_version() < (1, 2)):
        ms.finish()
        index.finish()
    ms, evt = sorterm(ms, idx=index, axis=0)

    assert np.array_equal(np.sort(m.get()), ms.get())

    # may be False because of identical values in array
    # assert np.array_equal(np.argsort(m.get()), index.get())

    # Check values by indices
    assert np.array_equal(m.get()[np.argsort(m.get())], m.get()[index.get()])

# }}}


if __name__ == "__main__":
    if len(sys.argv) > 1:
        exec(sys.argv[1])
    else:
        from pytest import main
        main([__file__])

# vim: filetype=pyopencl:fdm=marker