1#! /usr/bin/env python 2 3from __future__ import division, with_statement, absolute_import, print_function 4 5__copyright__ = "Copyright (C) 2013 Andreas Kloeckner" 6 7__license__ = """ 8Permission is hereby granted, free of charge, to any person obtaining a copy 9of this software and associated documentation files (the "Software"), to deal 10in the Software without restriction, including without limitation the rights 11to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12copies of the Software, and to permit persons to whom the Software is 13furnished to do so, subject to the following conditions: 14 15The above copyright notice and this permission notice shall be included in 16all copies or substantial portions of the Software. 17 18THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 24THE SOFTWARE. 25""" 26 27from six.moves import range, zip 28import numpy as np 29import numpy.linalg as la 30import sys 31from pytools import memoize 32from test_array import general_clrand 33 34import pytest 35 36import pyopencl as cl 37import pyopencl.array as cl_array # noqa 38from pyopencl.tools import ( # noqa 39 pytest_generate_tests_for_pyopencl as pytest_generate_tests) 40from pyopencl.characterize import has_double_support, has_struct_arg_count_bug 41from pyopencl.scan import (InclusiveScanKernel, ExclusiveScanKernel, 42 GenericScanKernel, GenericDebugScanKernel) 43from pyopencl.characterize import get_pocl_version 44 45 46# {{{ elementwise 47 48def test_elwise_kernel(ctx_factory): 49 context = ctx_factory() 50 queue = cl.CommandQueue(context) 51 52 from pyopencl.clrandom import rand as clrand 53 54 a_gpu = clrand(queue, (50,), np.float32) 55 b_gpu = clrand(queue, (50,), np.float32) 56 57 from pyopencl.elementwise import ElementwiseKernel 58 lin_comb = ElementwiseKernel(context, 59 "float a, float *x, float b, float *y, float *z", 60 "z[i] = a*x[i] + b*y[i]", 61 "linear_combination") 62 63 c_gpu = cl_array.empty_like(a_gpu) 64 lin_comb(5, a_gpu, 6, b_gpu, c_gpu) 65 66 assert la.norm((c_gpu - (5 * a_gpu + 6 * b_gpu)).get()) < 1e-5 67 68 69def test_elwise_kernel_with_options(ctx_factory): 70 from pyopencl.clrandom import rand as clrand 71 from pyopencl.elementwise import ElementwiseKernel 72 73 context = ctx_factory() 74 queue = cl.CommandQueue(context) 75 76 in_gpu = clrand(queue, (50,), np.float32) 77 78 options = ['-D', 'ADD_ONE'] 79 add_one = ElementwiseKernel( 80 context, 81 "float* out, const float *in", 82 """ 83 out[i] = in[i] 84 #ifdef ADD_ONE 85 +1 86 #endif 87 ; 88 """, 89 options=options, 90 ) 91 92 out_gpu = cl_array.empty_like(in_gpu) 93 add_one(out_gpu, in_gpu) 94 95 gt = in_gpu.get() + 1 96 gv = out_gpu.get() 97 assert la.norm(gv - gt) < 1e-5 98 99 100def test_ranged_elwise_kernel(ctx_factory): 101 context = ctx_factory() 102 queue = cl.CommandQueue(context) 103 104 from pyopencl.elementwise import ElementwiseKernel 105 set_to_seven = ElementwiseKernel(context, 106 "float *z", "z[i] = 7", "set_to_seven") 107 108 for i, slc in enumerate([ 109 slice(5, 20000), 110 slice(5, 20000, 17), 111 slice(3000, 5, -1), 112 slice(1000, -1), 113 ]): 114 115 a_gpu = cl_array.zeros(queue, (50000,), dtype=np.float32) 116 a_cpu = np.zeros(a_gpu.shape, a_gpu.dtype) 117 118 a_cpu[slc] = 7 119 set_to_seven(a_gpu, slice=slc) 120 121 assert (a_cpu == a_gpu.get()).all() 122 123 124def test_take(ctx_factory): 125 context = ctx_factory() 126 queue = cl.CommandQueue(context) 127 128 idx = cl_array.arange(queue, 0, 200000, 2, dtype=np.uint32) 129 a = cl_array.arange(queue, 0, 600000, 3, dtype=np.float32) 130 result = cl_array.take(a, idx) 131 assert ((3 * idx).get() == result.get()).all() 132 133 134def test_arange(ctx_factory): 135 context = ctx_factory() 136 queue = cl.CommandQueue(context) 137 138 n = 5000 139 a = cl_array.arange(queue, n, dtype=np.float32) 140 assert (np.arange(n, dtype=np.float32) == a.get()).all() 141 142 143def test_reverse(ctx_factory): 144 context = ctx_factory() 145 queue = cl.CommandQueue(context) 146 147 n = 5000 148 a = np.arange(n).astype(np.float32) 149 a_gpu = cl_array.to_device(queue, a) 150 151 a_gpu = a_gpu.reverse() 152 153 assert (a[::-1] == a_gpu.get()).all() 154 155 156def test_if_positive(ctx_factory): 157 context = ctx_factory() 158 queue = cl.CommandQueue(context) 159 160 from pyopencl.clrandom import rand as clrand 161 162 ary_len = 20000 163 a_gpu = clrand(queue, (ary_len,), np.float32) 164 b_gpu = clrand(queue, (ary_len,), np.float32) 165 a = a_gpu.get() 166 b = b_gpu.get() 167 168 max_a_b_gpu = cl_array.maximum(a_gpu, b_gpu) 169 min_a_b_gpu = cl_array.minimum(a_gpu, b_gpu) 170 171 print(max_a_b_gpu) 172 print(np.maximum(a, b)) 173 174 assert la.norm(max_a_b_gpu.get() - np.maximum(a, b)) == 0 175 assert la.norm(min_a_b_gpu.get() - np.minimum(a, b)) == 0 176 177 178def test_take_put(ctx_factory): 179 context = ctx_factory() 180 queue = cl.CommandQueue(context) 181 182 for n in [5, 17, 333]: 183 one_field_size = 8 184 buf_gpu = cl_array.zeros(queue, 185 n * one_field_size, dtype=np.float32) 186 dest_indices = cl_array.to_device(queue, 187 np.array([0, 1, 2, 3, 32, 33, 34, 35], dtype=np.uint32)) 188 read_map = cl_array.to_device(queue, 189 np.array([7, 6, 5, 4, 3, 2, 1, 0], dtype=np.uint32)) 190 191 cl_array.multi_take_put( 192 arrays=[buf_gpu for i in range(n)], 193 dest_indices=dest_indices, 194 src_indices=read_map, 195 src_offsets=[i * one_field_size for i in range(n)], 196 dest_shape=(96,)) 197 198 199def test_astype(ctx_factory): 200 context = ctx_factory() 201 queue = cl.CommandQueue(context) 202 203 from pyopencl.clrandom import rand as clrand 204 205 if not has_double_support(context.devices[0]): 206 from pytest import skip 207 skip("double precision not supported on %s" % context.devices[0]) 208 209 a_gpu = clrand(queue, (2000,), dtype=np.float32) 210 211 a = a_gpu.get().astype(np.float64) 212 a2 = a_gpu.astype(np.float64).get() 213 214 assert a2.dtype == np.float64 215 assert la.norm(a - a2) == 0, (a, a2) 216 217 a_gpu = clrand(queue, (2000,), dtype=np.float64) 218 219 a = a_gpu.get().astype(np.float32) 220 a2 = a_gpu.astype(np.float32).get() 221 222 assert a2.dtype == np.float32 223 assert la.norm(a - a2) / la.norm(a) < 1e-7 224 225# }}} 226 227 228# {{{ reduction 229 230def test_sum(ctx_factory): 231 from pytest import importorskip 232 importorskip("mako") 233 234 context = ctx_factory() 235 queue = cl.CommandQueue(context) 236 237 n = 200000 238 for dtype in [np.float32, np.complex64]: 239 a_gpu = general_clrand(queue, (n,), dtype) 240 241 a = a_gpu.get() 242 243 for slc in [ 244 slice(None), 245 slice(1000, 3000), 246 slice(1000, -3000), 247 slice(1000, None), 248 slice(1000, None, 3), 249 ]: 250 sum_a = np.sum(a[slc]) 251 252 if slc.step is None: 253 sum_a_gpu = cl_array.sum(a_gpu[slc]).get() 254 assert abs(sum_a_gpu - sum_a) / abs(sum_a) < 1e-4 255 256 sum_a_gpu_2 = cl_array.sum(a_gpu, slice=slc).get() 257 assert abs(sum_a_gpu_2 - sum_a) / abs(sum_a) < 1e-4 258 259 260def test_sum_without_data(ctx_factory): 261 from pytest import importorskip 262 importorskip("mako") 263 264 context = ctx_factory() 265 queue = cl.CommandQueue(context) 266 267 n = 2000 268 269 from pyopencl.reduction import ReductionKernel 270 red = ReductionKernel(context, np.int32, 271 neutral="0", 272 reduce_expr="a+b", map_expr="i", 273 arguments=[]) 274 275 result_dev = red(range=slice(n), queue=queue).get() 276 result_ref = n*(n-1)//2 277 278 assert result_dev == result_ref 279 280 281def test_minmax(ctx_factory): 282 from pytest import importorskip 283 importorskip("mako") 284 285 context = ctx_factory() 286 queue = cl.CommandQueue(context) 287 288 from pyopencl.clrandom import rand as clrand 289 290 if has_double_support(context.devices[0]): 291 dtypes = [np.float64, np.float32, np.int32] 292 else: 293 dtypes = [np.float32, np.int32] 294 295 for what in ["min", "max"]: 296 for dtype in dtypes: 297 a_gpu = clrand(queue, (200000,), dtype) 298 a = a_gpu.get() 299 300 op_a = getattr(np, what)(a) 301 op_a_gpu = getattr(cl_array, what)(a_gpu).get() 302 303 assert op_a_gpu == op_a, (op_a_gpu, op_a, dtype, what) 304 305 306def test_subset_minmax(ctx_factory): 307 from pytest import importorskip 308 importorskip("mako") 309 310 context = ctx_factory() 311 queue = cl.CommandQueue(context) 312 313 from pyopencl.clrandom import rand as clrand 314 315 l_a = 200000 316 gran = 5 317 l_m = l_a - l_a // gran + 1 318 319 if has_double_support(context.devices[0]): 320 dtypes = [np.float64, np.float32, np.int32] 321 else: 322 dtypes = [np.float32, np.int32] 323 324 for dtype in dtypes: 325 a_gpu = clrand(queue, (l_a,), dtype) 326 a = a_gpu.get() 327 328 meaningful_indices_gpu = cl_array.zeros( 329 queue, l_m, dtype=np.int32) 330 meaningful_indices = meaningful_indices_gpu.get() 331 j = 0 332 for i in range(len(meaningful_indices)): 333 meaningful_indices[i] = j 334 j = j + 1 335 if j % gran == 0: 336 j = j + 1 337 338 meaningful_indices_gpu = cl_array.to_device( 339 queue, meaningful_indices) 340 b = a[meaningful_indices] 341 342 min_a = np.min(b) 343 min_a_gpu = cl_array.subset_min(meaningful_indices_gpu, a_gpu).get() 344 345 assert min_a_gpu == min_a 346 347 348def test_dot(ctx_factory): 349 from pytest import importorskip 350 importorskip("mako") 351 352 context = ctx_factory() 353 queue = cl.CommandQueue(context) 354 355 dev = context.devices[0] 356 357 dtypes = [np.float32, np.complex64] 358 if has_double_support(dev): 359 if has_struct_arg_count_bug(dev) == "apple": 360 dtypes.extend([np.float64]) 361 else: 362 dtypes.extend([np.float64, np.complex128]) 363 364 for a_dtype in dtypes: 365 for b_dtype in dtypes: 366 print(a_dtype, b_dtype) 367 a_gpu = general_clrand(queue, (200000,), a_dtype) 368 a = a_gpu.get() 369 b_gpu = general_clrand(queue, (200000,), b_dtype) 370 b = b_gpu.get() 371 372 dot_ab = np.dot(a, b) 373 dot_ab_gpu = cl_array.dot(a_gpu, b_gpu).get() 374 375 assert abs(dot_ab_gpu - dot_ab) / abs(dot_ab) < 1e-4 376 377 try: 378 vdot_ab = np.vdot(a, b) 379 except NotImplementedError: 380 import sys 381 is_pypy = '__pypy__' in sys.builtin_module_names 382 if is_pypy: 383 print("PYPY: VDOT UNIMPLEMENTED") 384 continue 385 else: 386 raise 387 388 vdot_ab_gpu = cl_array.vdot(a_gpu, b_gpu).get() 389 390 rel_err = abs(vdot_ab_gpu - vdot_ab) / abs(vdot_ab) 391 assert rel_err < 1e-4, rel_err 392 393 394@memoize 395def make_mmc_dtype(device): 396 dtype = np.dtype([ 397 ("cur_min", np.int32), 398 ("cur_max", np.int32), 399 ("pad", np.int32), 400 ]) 401 402 name = "minmax_collector" 403 from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct 404 405 dtype, c_decl = match_dtype_to_c_struct(device, name, dtype) 406 dtype = get_or_register_dtype(name, dtype) 407 408 return dtype, c_decl 409 410 411def test_struct_reduce(ctx_factory): 412 pytest.importorskip("mako") 413 414 context = ctx_factory() 415 queue = cl.CommandQueue(context) 416 417 dev, = context.devices 418 if (dev.vendor == "NVIDIA" and dev.platform.vendor == "Apple" 419 and dev.driver_version == "8.12.47 310.40.00.05f01"): 420 pytest.skip("causes a compiler hang on Apple/Nv GPU") 421 422 mmc_dtype, mmc_c_decl = make_mmc_dtype(context.devices[0]) 423 424 preamble = mmc_c_decl + r"""//CL// 425 426 minmax_collector mmc_neutral() 427 { 428 // FIXME: needs infinity literal in real use, ok here 429 minmax_collector result; 430 result.cur_min = 1<<30; 431 result.cur_max = -(1<<30); 432 return result; 433 } 434 435 minmax_collector mmc_from_scalar(float x) 436 { 437 minmax_collector result; 438 result.cur_min = x; 439 result.cur_max = x; 440 return result; 441 } 442 443 minmax_collector agg_mmc(minmax_collector a, minmax_collector b) 444 { 445 minmax_collector result = a; 446 if (b.cur_min < result.cur_min) 447 result.cur_min = b.cur_min; 448 if (b.cur_max > result.cur_max) 449 result.cur_max = b.cur_max; 450 return result; 451 } 452 453 """ 454 455 from pyopencl.clrandom import rand as clrand 456 a_gpu = clrand(queue, (20000,), dtype=np.int32, a=0, b=10**6) 457 a = a_gpu.get() 458 459 from pyopencl.reduction import ReductionKernel 460 red = ReductionKernel(context, mmc_dtype, 461 neutral="mmc_neutral()", 462 reduce_expr="agg_mmc(a, b)", map_expr="mmc_from_scalar(x[i])", 463 arguments="__global int *x", preamble=preamble) 464 465 minmax = red(a_gpu).get() 466 #print minmax["cur_min"], minmax["cur_max"] 467 #print np.min(a), np.max(a) 468 469 assert abs(minmax["cur_min"] - np.min(a)) < 1e-5 470 assert abs(minmax["cur_max"] - np.max(a)) < 1e-5 471 472# }}} 473 474 475# {{{ scan-related 476 477def summarize_error(obtained, desired, orig, thresh=1e-5): 478 from pytest import importorskip 479 importorskip("mako") 480 481 err = obtained - desired 482 ok_count = 0 483 bad_count = 0 484 485 bad_limit = 200 486 487 def summarize_counts(): 488 if ok_count: 489 entries.append("<%d ok>" % ok_count) 490 if bad_count >= bad_limit: 491 entries.append("<%d more bad>" % (bad_count-bad_limit)) 492 493 entries = [] 494 for i, val in enumerate(err): 495 if abs(val) > thresh: 496 if ok_count: 497 summarize_counts() 498 ok_count = 0 499 500 bad_count += 1 501 502 if bad_count < bad_limit: 503 entries.append("%r (want: %r, got: %r, orig: %r)" % ( 504 obtained[i], desired[i], obtained[i], orig[i])) 505 else: 506 if bad_count: 507 summarize_counts() 508 bad_count = 0 509 510 ok_count += 1 511 512 summarize_counts() 513 514 return " ".join(entries) 515 516 517scan_test_counts = [ 518 10, 519 2 ** 8 - 1, 520 2 ** 8, 521 2 ** 8 + 1, 522 2 ** 10 - 5, 523 2 ** 10, 524 2 ** 10 + 5, 525 2 ** 12 - 5, 526 2 ** 12, 527 2 ** 12 + 5, 528 2 ** 20 - 2 ** 18, 529 2 ** 20 - 2 ** 18 + 5, 530 2 ** 20 + 1, 531 2 ** 20, 532 2 ** 23 + 3, 533 # larger sizes cause out of memory on low-end AMD APUs 534 ] 535 536 537@pytest.mark.parametrize("dtype", [np.int32, np.int64]) 538@pytest.mark.parametrize("scan_cls", [InclusiveScanKernel, ExclusiveScanKernel]) 539def test_scan(ctx_factory, dtype, scan_cls): 540 from pytest import importorskip 541 importorskip("mako") 542 543 context = ctx_factory() 544 queue = cl.CommandQueue(context) 545 546 knl = scan_cls(context, dtype, "a+b", "0") 547 548 for n in scan_test_counts: 549 host_data = np.random.randint(0, 10, n).astype(dtype) 550 dev_data = cl_array.to_device(queue, host_data) 551 552 # /!\ fails on Nv GT2?? for some drivers 553 assert (host_data == dev_data.get()).all() 554 555 knl(dev_data) 556 557 desired_result = np.cumsum(host_data, axis=0) 558 if scan_cls is ExclusiveScanKernel: 559 desired_result -= host_data 560 561 is_ok = (dev_data.get() == desired_result).all() 562 if 1 and not is_ok: 563 print("something went wrong, summarizing error...") 564 print(summarize_error(dev_data.get(), desired_result, host_data)) 565 566 print("dtype:%s n:%d %s worked:%s" % (dtype, n, scan_cls, is_ok)) 567 assert is_ok 568 from gc import collect 569 collect() 570 571 572@pytest.mark.parametrize("scan_cls", (GenericScanKernel, GenericDebugScanKernel)) 573def test_scan_with_vectorargs_with_offsets(ctx_factory, scan_cls): 574 context = ctx_factory() 575 queue = cl.CommandQueue(context) 576 577 from pyopencl.tools import VectorArg 578 579 knl = scan_cls( 580 context, float, 581 arguments=[ 582 VectorArg(float, "input", with_offset=True), 583 VectorArg(int, "segment", with_offset=True), 584 ], 585 input_expr="input[i]", 586 is_segment_start_expr="segment[i]", 587 scan_expr="a+b", neutral="0", 588 output_statement=""" 589 input[i] = item; 590 """) 591 592 n = 20 593 594 host_data = np.random.randint(0, 10, n).astype(float) 595 dev_data = cl.array.to_device(queue, host_data) 596 segment_data = np.zeros(n, dtype=int) 597 dev_segment_data = cl.array.to_device(queue, segment_data) 598 599 knl(dev_data, dev_segment_data) 600 601 assert (dev_data.get() == np.cumsum(host_data)).all() 602 603 604def test_copy_if(ctx_factory): 605 from pytest import importorskip 606 importorskip("mako") 607 608 context = ctx_factory() 609 queue = cl.CommandQueue(context) 610 611 from pyopencl.clrandom import rand as clrand 612 for n in scan_test_counts: 613 a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000) 614 a = a_dev.get() 615 616 from pyopencl.algorithm import copy_if 617 618 crit = a_dev.dtype.type(300) 619 selected = a[a > crit] 620 selected_dev, count_dev, evt = copy_if( 621 a_dev, "ary[i] > myval", [("myval", crit)]) 622 623 assert (selected_dev.get()[:count_dev.get()] == selected).all() 624 from gc import collect 625 collect() 626 627 628def test_partition(ctx_factory): 629 from pytest import importorskip 630 importorskip("mako") 631 632 context = ctx_factory() 633 queue = cl.CommandQueue(context) 634 635 from pyopencl.clrandom import rand as clrand 636 for n in scan_test_counts: 637 print("part", n) 638 639 a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000) 640 a = a_dev.get() 641 642 crit = a_dev.dtype.type(300) 643 true_host = a[a > crit] 644 false_host = a[a <= crit] 645 646 from pyopencl.algorithm import partition 647 true_dev, false_dev, count_true_dev, evt = partition( 648 a_dev, "ary[i] > myval", [("myval", crit)]) 649 650 count_true_dev = count_true_dev.get() 651 652 assert (true_dev.get()[:count_true_dev] == true_host).all() 653 assert (false_dev.get()[:n-count_true_dev] == false_host).all() 654 655 656def test_unique(ctx_factory): 657 from pytest import importorskip 658 importorskip("mako") 659 660 context = ctx_factory() 661 queue = cl.CommandQueue(context) 662 663 from pyopencl.clrandom import rand as clrand 664 for n in scan_test_counts: 665 a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000) 666 a = a_dev.get() 667 a = np.sort(a) 668 a_dev = cl_array.to_device(queue, a) 669 670 a_unique_host = np.unique(a) 671 672 from pyopencl.algorithm import unique 673 a_unique_dev, count_unique_dev, evt = unique(a_dev) 674 675 count_unique_dev = count_unique_dev.get() 676 677 assert (a_unique_dev.get()[:count_unique_dev] == a_unique_host).all() 678 from gc import collect 679 collect() 680 681 682def test_index_preservation(ctx_factory): 683 from pytest import importorskip 684 importorskip("mako") 685 686 context = ctx_factory() 687 queue = cl.CommandQueue(context) 688 689 classes = [GenericScanKernel] 690 691 dev = context.devices[0] 692 if dev.type & cl.device_type.CPU: 693 classes.append(GenericDebugScanKernel) 694 695 for cls in classes: 696 for n in scan_test_counts: 697 knl = cls( 698 context, np.int32, 699 arguments="__global int *out", 700 input_expr="i", 701 scan_expr="b", neutral="0", 702 output_statement=""" 703 out[i] = item; 704 """) 705 706 out = cl_array.empty(queue, n, dtype=np.int32) 707 knl(out) 708 709 assert (out.get() == np.arange(n)).all() 710 from gc import collect 711 collect() 712 713 714def test_segmented_scan(ctx_factory): 715 from pytest import importorskip 716 importorskip("mako") 717 718 context = ctx_factory() 719 queue = cl.CommandQueue(context) 720 721 from pyopencl.tools import dtype_to_ctype 722 dtype = np.int32 723 ctype = dtype_to_ctype(dtype) 724 725 #for is_exclusive in [False, True]: 726 for is_exclusive in [True, False]: 727 if is_exclusive: 728 output_statement = "out[i] = prev_item" 729 else: 730 output_statement = "out[i] = item" 731 732 knl = GenericScanKernel(context, dtype, 733 arguments="__global %s *ary, __global char *segflags, " 734 "__global %s *out" % (ctype, ctype), 735 input_expr="ary[i]", 736 scan_expr="across_seg_boundary ? b : (a+b)", neutral="0", 737 is_segment_start_expr="segflags[i]", 738 output_statement=output_statement, 739 options=[]) 740 741 np.set_printoptions(threshold=2000) 742 from random import randrange 743 from pyopencl.clrandom import rand as clrand 744 for n in scan_test_counts: 745 a_dev = clrand(queue, (n,), dtype=dtype, a=0, b=10) 746 a = a_dev.get() 747 748 if 10 <= n < 20: 749 seg_boundaries_values = [ 750 [0, 9], 751 [0, 3], 752 [4, 6], 753 ] 754 else: 755 seg_boundaries_values = [] 756 for i in range(10): 757 seg_boundary_count = max(2, min(100, randrange(0, int(0.4*n)))) 758 seg_boundaries = [ 759 randrange(n) for i in range(seg_boundary_count)] 760 if n >= 1029: 761 seg_boundaries.insert(0, 1028) 762 seg_boundaries.sort() 763 seg_boundaries_values.append(seg_boundaries) 764 765 for seg_boundaries in seg_boundaries_values: 766 #print "BOUNDARIES", seg_boundaries 767 #print a 768 769 seg_boundary_flags = np.zeros(n, dtype=np.uint8) 770 seg_boundary_flags[seg_boundaries] = 1 771 seg_boundary_flags_dev = cl_array.to_device( 772 queue, seg_boundary_flags) 773 774 seg_boundaries.insert(0, 0) 775 776 result_host = a.copy() 777 for i, seg_start in enumerate(seg_boundaries): 778 if i+1 < len(seg_boundaries): 779 seg_end = seg_boundaries[i+1] 780 else: 781 seg_end = None 782 783 if is_exclusive: 784 result_host[seg_start+1:seg_end] = np.cumsum( 785 a[seg_start:seg_end][:-1]) 786 result_host[seg_start] = 0 787 else: 788 result_host[seg_start:seg_end] = np.cumsum( 789 a[seg_start:seg_end]) 790 791 #print "REF", result_host 792 793 result_dev = cl_array.empty_like(a_dev) 794 knl(a_dev, seg_boundary_flags_dev, result_dev) 795 796 #print "RES", result_dev 797 is_correct = (result_dev.get() == result_host).all() 798 if not is_correct: 799 diff = result_dev.get() - result_host 800 print("RES-REF", diff) 801 print("ERRWHERE", np.where(diff)) 802 print(n, list(seg_boundaries)) 803 804 assert is_correct 805 from gc import collect 806 collect() 807 808 print("%d excl:%s done" % (n, is_exclusive)) 809 810 811@pytest.mark.parametrize("scan_kernel", [GenericScanKernel, GenericDebugScanKernel]) 812def test_sort(ctx_factory, scan_kernel): 813 from pytest import importorskip 814 importorskip("mako") 815 816 context = ctx_factory() 817 queue = cl.CommandQueue(context) 818 819 dtype = np.int32 820 821 from pyopencl.algorithm import RadixSort 822 sort = RadixSort(context, "int *ary", key_expr="ary[i]", 823 sort_arg_names=["ary"], scan_kernel=scan_kernel) 824 825 from pyopencl.clrandom import RanluxGenerator 826 rng = RanluxGenerator(queue, seed=15) 827 828 from time import time 829 830 # intermediate arrays for largest size cause out-of-memory on low-end GPUs 831 for n in scan_test_counts[:-1]: 832 if n >= 2000 and isinstance(scan_kernel, GenericDebugScanKernel): 833 continue 834 835 print(n) 836 837 print(" rng") 838 a_dev = rng.uniform(queue, (n,), dtype=dtype, a=0, b=2**16) 839 a = a_dev.get() 840 841 dev_start = time() 842 print(" device") 843 (a_dev_sorted,), evt = sort(a_dev, key_bits=16) 844 queue.finish() 845 dev_end = time() 846 print(" numpy") 847 a_sorted = np.sort(a) 848 numpy_end = time() 849 850 numpy_elapsed = numpy_end-dev_end 851 dev_elapsed = dev_end-dev_start 852 print(" dev: %.2f MKeys/s numpy: %.2f MKeys/s ratio: %.2fx" % ( 853 1e-6*n/dev_elapsed, 1e-6*n/numpy_elapsed, numpy_elapsed/dev_elapsed)) 854 assert (a_dev_sorted.get() == a_sorted).all() 855 856 857def test_list_builder(ctx_factory): 858 from pytest import importorskip 859 importorskip("mako") 860 861 context = ctx_factory() 862 queue = cl.CommandQueue(context) 863 864 from pyopencl.algorithm import ListOfListsBuilder 865 builder = ListOfListsBuilder(context, [("mylist", np.int32)], """//CL// 866 void generate(LIST_ARG_DECL USER_ARG_DECL index_type i) 867 { 868 int count = i % 4; 869 for (int j = 0; j < count; ++j) 870 { 871 APPEND_mylist(count); 872 } 873 } 874 """, arg_decls=[]) 875 876 result, evt = builder(queue, 2000) 877 878 inf = result["mylist"] 879 assert inf.count == 3000 880 assert (inf.lists.get()[-6:] == [1, 2, 2, 3, 3, 3]).all() 881 882 883def test_list_builder_with_empty_elim(ctx_factory): 884 from pytest import importorskip 885 importorskip("mako") 886 887 context = ctx_factory() 888 queue = cl.CommandQueue(context) 889 890 from pyopencl.algorithm import ListOfListsBuilder 891 892 builder = ListOfListsBuilder( 893 context, 894 [("mylist1", np.int32), ("mylist2", np.int32), ("mylist3", np.int32)], 895 """//CL// 896 void generate(LIST_ARG_DECL USER_ARG_DECL index_type i) 897 { 898 if (i % 5 == 0) 899 { 900 for (int j = 0; j < i / 5; ++j) 901 { 902 APPEND_mylist1(j); 903 APPEND_mylist2(j + 1); 904 APPEND_mylist3(j); 905 } 906 } 907 } 908 """, 909 arg_decls=[], 910 eliminate_empty_output_lists=["mylist1", "mylist2"]) 911 912 result, evt = builder(queue, 1000) 913 914 mylist1 = result["mylist1"] 915 assert mylist1.count == 19900 916 assert (mylist1.starts.get()[:5] == [0, 1, 3, 6, 10]).all() 917 assert (mylist1.nonempty_indices.get()[:5] == [5, 10, 15, 20, 25]).all() 918 assert (mylist1.lists.get()[:6] == [0, 0, 1, 0, 1, 2]).all() 919 mylist2 = result["mylist2"] 920 assert mylist2.count == 19900 921 assert (mylist2.lists.get()[:6] == [1, 1, 2, 1, 2, 3]).all() 922 mylist3 = result["mylist3"] 923 assert mylist3.count == 19900 924 assert (mylist3.starts.get()[:10] == [0, 0, 0, 0, 0, 0, 1, 1, 1, 1]).all() 925 assert (mylist3.lists.get()[:6] == [0, 0, 1, 0, 1, 2]).all() 926 927 928def test_key_value_sorter(ctx_factory): 929 from pytest import importorskip 930 importorskip("mako") 931 932 context = ctx_factory() 933 queue = cl.CommandQueue(context) 934 935 n = 10**5 936 nkeys = 2000 937 from pyopencl.clrandom import rand as clrand 938 keys = clrand(queue, n, np.int32, b=nkeys) 939 values = clrand(queue, n, np.int32, b=n).astype(np.int64) 940 941 assert np.max(keys.get()) < nkeys 942 943 from pyopencl.algorithm import KeyValueSorter 944 kvs = KeyValueSorter(context) 945 starts, lists, evt = kvs(queue, keys, values, nkeys, starts_dtype=np.int32) 946 947 starts = starts.get() 948 lists = lists.get() 949 950 mydict = dict() 951 for k, v in zip(keys.get(), values.get()): 952 mydict.setdefault(k, []).append(v) 953 954 for i in range(nkeys): 955 start, end = starts[i:i+2] 956 assert sorted(mydict[i]) == sorted(lists[start:end]) 957 958# }}} 959 960 961# {{{ bitonic sort 962 963@pytest.mark.parametrize("size", [ 964 512, 965 4, 966 16 967 ]) 968@pytest.mark.parametrize("dtype", [ 969 np.int32, 970 np.float32, 971 np.float64 972 ]) 973@pytest.mark.bitonic 974def test_bitonic_sort(ctx_factory, size, dtype): 975 ctx = cl.create_some_context() 976 queue = cl.CommandQueue(ctx) 977 978 dev = ctx.devices[0] 979 if (dev.platform.name == "Apple" and dev.type & cl.device_type.CPU): 980 pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup " 981 "parallelism") 982 if (dev.platform.name == "Portable Computing Language" 983 and dtype == np.float64 984 and get_pocl_version(dev.platform) < (1, 0)): 985 pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0") 986 987 if dtype == np.float64 and not has_double_support(dev): 988 from pytest import skip 989 skip("double precision not supported on %s" % dev) 990 991 import pyopencl.clrandom as clrandom 992 from pyopencl.bitonic_sort import BitonicSort 993 994 s = clrandom.rand(queue, (2, size, 3,), dtype, luxury=None, a=0, b=239482333) 995 sgs = s.copy() 996 # enqueue_marker crashes under CL 1.1 pocl if there is anything to wait for 997 # (no clEnqueueWaitForEvents) https://github.com/inducer/pyopencl/pull/237 998 if (dev.platform.name == "Portable Computing Language" 999 and cl.get_cl_header_version() < (1, 2)): 1000 sgs.finish() 1001 sorter = BitonicSort(ctx) 1002 sgs, evt = sorter(sgs, axis=1) 1003 assert np.array_equal(np.sort(s.get(), axis=1), sgs.get()) 1004 1005 1006@pytest.mark.parametrize("size", [ 1007 0, 1008 4, 1009 2**14, 1010 2**18, 1011 ]) 1012@pytest.mark.parametrize("dtype", [ 1013 np.int32, 1014 np.float32, 1015 np.float64 1016 ]) 1017@pytest.mark.bitonic 1018def test_bitonic_argsort(ctx_factory, size, dtype): 1019 import sys 1020 is_pypy = '__pypy__' in sys.builtin_module_names 1021 1022 if not size and is_pypy: 1023 # https://bitbucket.org/pypy/numpy/issues/53/specifying-strides-on-zero-sized-array 1024 pytest.xfail("pypy doesn't seem to handle as_strided " 1025 "on zero-sized arrays very well") 1026 1027 ctx = cl.create_some_context() 1028 queue = cl.CommandQueue(ctx) 1029 1030 dev = ctx.devices[0] 1031 if (dev.platform.name == "Portable Computing Language" 1032 and sys.platform == "darwin"): 1033 pytest.xfail("Bitonic sort crashes on Apple POCL") 1034 if (dev.platform.name == "Apple" and dev.type & cl.device_type.CPU): 1035 pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup " 1036 "parallelism") 1037 if (dev.platform.name == "Portable Computing Language" 1038 and dtype == np.float64 1039 and get_pocl_version(dev.platform) < (1, 0)): 1040 pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0") 1041 1042 if dtype == np.float64 and not has_double_support(dev): 1043 from pytest import skip 1044 skip("double precision not supported on %s" % dev) 1045 1046 import pyopencl.clrandom as clrandom 1047 from pyopencl.bitonic_sort import BitonicSort 1048 1049 index = cl_array.arange(queue, 0, size, 1, dtype=np.int32) 1050 m = clrandom.rand(queue, (size,), dtype, luxury=None, a=0, b=239432234) 1051 1052 sorterm = BitonicSort(ctx) 1053 1054 ms = m.copy() 1055 # enqueue_marker crashes under CL 1.1 pocl if there is anything to wait for 1056 # (no clEnqueueWaitForEvents) https://github.com/inducer/pyopencl/pull/237 1057 if (dev.platform.name == "Portable Computing Language" 1058 and cl.get_cl_header_version() < (1, 2)): 1059 ms.finish() 1060 index.finish() 1061 ms, evt = sorterm(ms, idx=index, axis=0) 1062 1063 assert np.array_equal(np.sort(m.get()), ms.get()) 1064 1065 # may be False because of identical values in array 1066 # assert np.array_equal(np.argsort(m.get()), index.get()) 1067 1068 # Check values by indices 1069 assert np.array_equal(m.get()[np.argsort(m.get())], m.get()[index.get()]) 1070 1071# }}} 1072 1073 1074if __name__ == "__main__": 1075 if len(sys.argv) > 1: 1076 exec(sys.argv[1]) 1077 else: 1078 from pytest import main 1079 main([__file__]) 1080 1081# vim: filetype=pyopencl:fdm=marker 1082