1#! /usr/bin/env python 2from __future__ import division, with_statement, absolute_import, print_function 3 4__copyright__ = "Copyright (C) 2009 Andreas Kloeckner" 5 6__license__ = """ 7Permission is hereby granted, free of charge, to any person obtaining a copy 8of this software and associated documentation files (the "Software"), to deal 9in the Software without restriction, including without limitation the rights 10to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11copies of the Software, and to permit persons to whom the Software is 12furnished to do so, subject to the following conditions: 13 14The above copyright notice and this permission notice shall be included in 15all copies or substantial portions of the Software. 16 17THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23THE SOFTWARE. 24""" 25 26import numpy as np 27import numpy.linalg as la 28import sys 29 30from six.moves import range 31import pytest 32 33import pyopencl as cl 34import pyopencl.array as cl_array 35import pyopencl.cltypes as cltypes 36import pyopencl.tools as cl_tools 37from pyopencl.tools import ( # noqa 38 pytest_generate_tests_for_pyopencl as pytest_generate_tests) 39from pyopencl.characterize import has_double_support, has_struct_arg_count_bug 40 41from pyopencl.clrandom import RanluxGenerator, PhiloxGenerator, ThreefryGenerator 42 43_PYPY = cl._PYPY 44 45 46# {{{ helpers 47 48TO_REAL = { 49 np.dtype(np.complex64): np.float32, 50 np.dtype(np.complex128): np.float64 51 } 52 53 54def general_clrand(queue, shape, dtype): 55 from pyopencl.clrandom import rand as clrand 56 57 dtype = np.dtype(dtype) 58 if dtype.kind == "c": 59 real_dtype = dtype.type(0).real.dtype 60 return clrand(queue, shape, real_dtype) + 1j*clrand(queue, shape, real_dtype) 61 else: 62 return clrand(queue, shape, dtype) 63 64 65def make_random_array(queue, dtype, size): 66 from pyopencl.clrandom import rand 67 68 dtype = np.dtype(dtype) 69 if dtype.kind == "c": 70 real_dtype = TO_REAL[dtype] 71 return (rand(queue, shape=(size,), dtype=real_dtype).astype(dtype) 72 + rand(queue, shape=(size,), dtype=real_dtype).astype(dtype) 73 * dtype.type(1j)) 74 else: 75 return rand(queue, shape=(size,), dtype=dtype) 76 77# }}} 78 79 80# {{{ dtype-related 81 82def test_basic_complex(ctx_factory): 83 context = ctx_factory() 84 queue = cl.CommandQueue(context) 85 86 from pyopencl.clrandom import rand 87 88 size = 500 89 90 ary = (rand(queue, shape=(size,), dtype=np.float32).astype(np.complex64) 91 + rand(queue, shape=(size,), dtype=np.float32).astype(np.complex64) * 1j) 92 c = np.complex64(5+7j) 93 94 host_ary = ary.get() 95 assert la.norm((ary*c).get() - c*host_ary) < 1e-5 * la.norm(host_ary) 96 97 98def test_mix_complex(ctx_factory): 99 context = ctx_factory() 100 queue = cl.CommandQueue(context) 101 102 size = 10 103 104 dtypes = [ 105 (np.float32, np.complex64), 106 #(np.int32, np.complex64), 107 ] 108 109 dev = context.devices[0] 110 if has_double_support(dev) and has_struct_arg_count_bug(dev) == "apple": 111 dtypes.extend([ 112 (np.float32, np.float64), 113 ]) 114 elif has_double_support(dev): 115 dtypes.extend([ 116 (np.float32, np.float64), 117 (np.float32, np.complex128), 118 (np.float64, np.complex64), 119 (np.float64, np.complex128), 120 ]) 121 122 from operator import add, mul, sub, truediv 123 for op in [add, sub, mul, truediv, pow]: 124 for dtype_a0, dtype_b0 in dtypes: 125 for dtype_a, dtype_b in [ 126 (dtype_a0, dtype_b0), 127 (dtype_b0, dtype_a0), 128 ]: 129 for is_scalar_a, is_scalar_b in [ 130 (False, False), 131 (False, True), 132 (True, False), 133 ]: 134 if is_scalar_a: 135 ary_a = make_random_array(queue, dtype_a, 1).get()[0] 136 host_ary_a = ary_a 137 else: 138 ary_a = make_random_array(queue, dtype_a, size) 139 host_ary_a = ary_a.get() 140 141 if is_scalar_b: 142 ary_b = make_random_array(queue, dtype_b, 1).get()[0] 143 host_ary_b = ary_b 144 else: 145 ary_b = make_random_array(queue, dtype_b, size) 146 host_ary_b = ary_b.get() 147 148 print(op, dtype_a, dtype_b, is_scalar_a, is_scalar_b) 149 dev_result = op(ary_a, ary_b).get() 150 host_result = op(host_ary_a, host_ary_b) 151 152 if host_result.dtype != dev_result.dtype: 153 # This appears to be a numpy bug, where we get 154 # served a Python complex that is really a 155 # smaller numpy complex. 156 157 print("HOST_DTYPE: %s DEV_DTYPE: %s" % ( 158 host_result.dtype, dev_result.dtype)) 159 160 dev_result = dev_result.astype(host_result.dtype) 161 162 err = la.norm(host_result-dev_result)/la.norm(host_result) 163 print(err) 164 correct = err < 1e-4 165 if not correct: 166 print(host_result) 167 print(dev_result) 168 print(host_result - dev_result) 169 170 assert correct 171 172 173def test_pow_neg1_vs_inv(ctx_factory): 174 ctx = ctx_factory() 175 queue = cl.CommandQueue(ctx) 176 177 device = ctx.devices[0] 178 if not has_double_support(device): 179 from pytest import skip 180 skip("double precision not supported on %s" % device) 181 if has_struct_arg_count_bug(device) == "apple": 182 from pytest import xfail 183 xfail("apple struct arg counting broken") 184 185 a_dev = make_random_array(queue, np.complex128, 20000) 186 187 res1 = (a_dev ** (-1)).get() 188 res2 = (1/a_dev).get() 189 ref = 1/a_dev.get() 190 191 assert la.norm(res1-ref, np.inf) / la.norm(ref) < 1e-13 192 assert la.norm(res2-ref, np.inf) / la.norm(ref) < 1e-13 193 194 195def test_vector_fill(ctx_factory): 196 context = ctx_factory() 197 queue = cl.CommandQueue(context) 198 199 a_gpu = cl_array.Array(queue, 100, dtype=cltypes.float4) 200 a_gpu.fill(cltypes.make_float4(0.0, 0.0, 1.0, 0.0)) 201 a = a_gpu.get() 202 assert a.dtype == cltypes.float4 203 204 a_gpu = cl_array.zeros(queue, 100, dtype=cltypes.float4) 205 206 207def test_absrealimag(ctx_factory): 208 context = ctx_factory() 209 queue = cl.CommandQueue(context) 210 211 def real(x): 212 return x.real 213 214 def imag(x): 215 return x.imag 216 217 def conj(x): 218 return x.conj() 219 220 n = 111 221 for func in [abs, real, imag, conj]: 222 for dtype in [np.int32, np.float32, np.complex64]: 223 print(func, dtype) 224 a = -make_random_array(queue, dtype, n) 225 226 host_res = func(a.get()) 227 dev_res = func(a).get() 228 229 correct = np.allclose(dev_res, host_res) 230 if not correct: 231 print(dev_res) 232 print(host_res) 233 print(dev_res-host_res) 234 assert correct 235 236 237def test_custom_type_zeros(ctx_factory): 238 context = ctx_factory() 239 queue = cl.CommandQueue(context) 240 241 if not ( 242 queue._get_cl_version() >= (1, 2) 243 and cl.get_cl_header_version() >= (1, 2)): 244 pytest.skip("CL1.2 not available") 245 246 dtype = np.dtype([ 247 ("cur_min", np.int32), 248 ("cur_max", np.int32), 249 ("pad", np.int32), 250 ]) 251 252 from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct 253 254 name = "mmc_type" 255 dtype, c_decl = match_dtype_to_c_struct(queue.device, name, dtype) 256 dtype = get_or_register_dtype(name, dtype) 257 258 n = 1000 259 z_dev = cl.array.zeros(queue, n, dtype=dtype) 260 261 z = z_dev.get() 262 263 assert np.array_equal(np.zeros(n, dtype), z) 264 265 266def test_custom_type_fill(ctx_factory): 267 context = ctx_factory() 268 queue = cl.CommandQueue(context) 269 270 from pyopencl.characterize import has_struct_arg_count_bug 271 if has_struct_arg_count_bug(queue.device): 272 pytest.skip("device has LLVM arg counting bug") 273 274 dtype = np.dtype([ 275 ("cur_min", np.int32), 276 ("cur_max", np.int32), 277 ("pad", np.int32), 278 ]) 279 280 from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct 281 282 name = "mmc_type" 283 dtype, c_decl = match_dtype_to_c_struct(queue.device, name, dtype) 284 dtype = get_or_register_dtype(name, dtype) 285 286 n = 1000 287 z_dev = cl.array.empty(queue, n, dtype=dtype) 288 z_dev.fill(np.zeros((), dtype)) 289 290 z = z_dev.get() 291 292 assert np.array_equal(np.zeros(n, dtype), z) 293 294 295def test_custom_type_take_put(ctx_factory): 296 context = ctx_factory() 297 queue = cl.CommandQueue(context) 298 299 dtype = np.dtype([ 300 ("cur_min", np.int32), 301 ("cur_max", np.int32), 302 ]) 303 304 from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct 305 306 name = "tp_type" 307 dtype, c_decl = match_dtype_to_c_struct(queue.device, name, dtype) 308 dtype = get_or_register_dtype(name, dtype) 309 310 n = 100 311 z = np.empty(100, dtype) 312 z["cur_min"] = np.arange(n) 313 z["cur_max"] = np.arange(n)**2 314 315 z_dev = cl.array.to_device(queue, z) 316 ind = cl.array.arange(queue, n, step=3, dtype=np.int32) 317 318 z_ind_ref = z[ind.get()] 319 z_ind = z_dev[ind] 320 321 assert np.array_equal(z_ind.get(), z_ind_ref) 322 323# }}} 324 325 326# {{{ operators 327 328def test_rmul_yields_right_type(ctx_factory): 329 context = ctx_factory() 330 queue = cl.CommandQueue(context) 331 332 a = np.array([1, 2, 3, 4, 5]).astype(np.float32) 333 a_gpu = cl_array.to_device(queue, a) 334 335 two_a = 2*a_gpu 336 assert isinstance(two_a, cl_array.Array) 337 338 two_a = np.float32(2)*a_gpu 339 assert isinstance(two_a, cl_array.Array) 340 341 342def test_pow_array(ctx_factory): 343 context = ctx_factory() 344 queue = cl.CommandQueue(context) 345 346 a = np.array([1, 2, 3, 4, 5]).astype(np.float32) 347 a_gpu = cl_array.to_device(queue, a) 348 349 result = pow(a_gpu, a_gpu).get() 350 assert (np.abs(a ** a - result) < 3e-3).all() 351 352 result = (a_gpu ** a_gpu).get() 353 assert (np.abs(pow(a, a) - result) < 3e-3).all() 354 355 356def test_pow_number(ctx_factory): 357 context = ctx_factory() 358 queue = cl.CommandQueue(context) 359 360 a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) 361 a_gpu = cl_array.to_device(queue, a) 362 363 result = pow(a_gpu, 2).get() 364 assert (np.abs(a ** 2 - result) < 1e-3).all() 365 366 367def test_multiply(ctx_factory): 368 """Test the muliplication of an array with a scalar. """ 369 370 context = ctx_factory() 371 queue = cl.CommandQueue(context) 372 373 for sz in [10, 50000]: 374 for dtype, scalars in [ 375 (np.float32, [2]), 376 (np.complex64, [2j]), 377 ]: 378 for scalar in scalars: 379 a_gpu = make_random_array(queue, dtype, sz) 380 a = a_gpu.get() 381 a_mult = (scalar * a_gpu).get() 382 383 assert (a * scalar == a_mult).all() 384 385 386def test_multiply_array(ctx_factory): 387 """Test the multiplication of two arrays.""" 388 389 context = ctx_factory() 390 queue = cl.CommandQueue(context) 391 392 a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) 393 394 a_gpu = cl_array.to_device(queue, a) 395 b_gpu = cl_array.to_device(queue, a) 396 397 a_squared = (b_gpu * a_gpu).get() 398 399 assert (a * a == a_squared).all() 400 401 402def test_addition_array(ctx_factory): 403 """Test the addition of two arrays.""" 404 405 context = ctx_factory() 406 queue = cl.CommandQueue(context) 407 408 a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) 409 a_gpu = cl_array.to_device(queue, a) 410 a_added = (a_gpu + a_gpu).get() 411 412 assert (a + a == a_added).all() 413 414 415def test_addition_scalar(ctx_factory): 416 """Test the addition of an array and a scalar.""" 417 418 context = ctx_factory() 419 queue = cl.CommandQueue(context) 420 421 a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) 422 a_gpu = cl_array.to_device(queue, a) 423 a_added = (7 + a_gpu).get() 424 425 assert (7 + a == a_added).all() 426 427 428def test_substract_array(ctx_factory): 429 """Test the substraction of two arrays.""" 430 #test data 431 a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) 432 b = np.array([10, 20, 30, 40, 50, 433 60, 70, 80, 90, 100]).astype(np.float32) 434 435 context = ctx_factory() 436 queue = cl.CommandQueue(context) 437 438 a_gpu = cl_array.to_device(queue, a) 439 b_gpu = cl_array.to_device(queue, b) 440 441 result = (a_gpu - b_gpu).get() 442 assert (a - b == result).all() 443 444 result = (b_gpu - a_gpu).get() 445 assert (b - a == result).all() 446 447 448def test_substract_scalar(ctx_factory): 449 """Test the substraction of an array and a scalar.""" 450 451 context = ctx_factory() 452 queue = cl.CommandQueue(context) 453 454 #test data 455 a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) 456 457 #convert a to a gpu object 458 a_gpu = cl_array.to_device(queue, a) 459 460 result = (a_gpu - 7).get() 461 assert (a - 7 == result).all() 462 463 result = (7 - a_gpu).get() 464 assert (7 - a == result).all() 465 466 467def test_divide_scalar(ctx_factory): 468 """Test the division of an array and a scalar.""" 469 470 context = ctx_factory() 471 queue = cl.CommandQueue(context) 472 473 a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) 474 a_gpu = cl_array.to_device(queue, a) 475 476 result = (a_gpu / 2).get() 477 assert (a / 2 == result).all() 478 479 result = (2 / a_gpu).get() 480 assert (np.abs(2 / a - result) < 1e-5).all() 481 482 483def test_divide_array(ctx_factory): 484 """Test the division of an array and a scalar. """ 485 486 context = ctx_factory() 487 queue = cl.CommandQueue(context) 488 489 #test data 490 a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(np.float32) 491 b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(np.float32) 492 493 a_gpu = cl_array.to_device(queue, a) 494 b_gpu = cl_array.to_device(queue, b) 495 496 a_divide = (a_gpu / b_gpu).get() 497 assert (np.abs(a / b - a_divide) < 1e-3).all() 498 499 a_divide = (b_gpu / a_gpu).get() 500 assert (np.abs(b / a - a_divide) < 1e-3).all() 501 502 503def test_bitwise(ctx_factory): 504 if _PYPY: 505 pytest.xfail("numpypy: missing bitwise ops") 506 507 context = ctx_factory() 508 queue = cl.CommandQueue(context) 509 510 from itertools import product 511 512 dtypes = [np.dtype(t) for t in (np.int64, np.int32, np.int16, np.int8)] 513 514 from pyopencl.clrandom import rand as clrand 515 516 for a_dtype, b_dtype in product(dtypes, dtypes): 517 ary_len = 16 518 519 np.random.seed(10) 520 521 int32_min = np.iinfo(np.int32).min 522 int32_max = np.iinfo(np.int32).max 523 524 a_dev = clrand( 525 queue, (ary_len,), a=int32_min, b=1+int32_max, dtype=np.int64 526 ).astype(a_dtype) 527 b_dev = clrand( 528 queue, (ary_len,), a=int32_min, b=1+int32_max, dtype=np.int64 529 ).astype(b_dtype) 530 531 a = a_dev.get() 532 b = b_dev.get() 533 s = int((clrand(queue, (), a=int32_min, b=1+int32_max, dtype=np.int64) 534 .astype(b_dtype).get())) 535 536 import operator as o 537 538 for op in [o.and_, o.or_, o.xor]: 539 res_dev = op(a_dev, b_dev) 540 res = op(a, b) 541 542 assert (res_dev.get() == res).all() 543 544 res_dev = op(a_dev, s) 545 res = op(a, s) 546 547 assert (res_dev.get() == res).all() 548 549 res_dev = op(s, b_dev) 550 res = op(s, b) 551 552 assert (res_dev.get() == res).all() 553 554 for op in [o.iand, o.ior, o.ixor]: 555 res_dev = a_dev.copy() 556 op_res = op(res_dev, b_dev) 557 assert op_res is res_dev 558 559 res = a.copy() 560 op(res, b) 561 562 assert (res_dev.get() == res).all() 563 564 res_dev = a_dev.copy() 565 op_res = op(res_dev, s) 566 assert op_res is res_dev 567 res = a.copy() 568 op(res, s) 569 570 assert (res_dev.get() == res).all() 571 572 # Test unary ~ 573 res_dev = ~a_dev 574 res = ~a 575 assert (res_dev.get() == res).all() 576 577# }}} 578 579 580# {{{ RNG 581 582@pytest.mark.parametrize("rng_class", 583 [RanluxGenerator, PhiloxGenerator, ThreefryGenerator]) 584@pytest.mark.parametrize("ary_size", [300, 301, 302, 303, 10007, 1000000]) 585def test_random_float_in_range(ctx_factory, rng_class, ary_size, plot_hist=False): 586 context = ctx_factory() 587 queue = cl.CommandQueue(context) 588 589 if has_double_support(context.devices[0]): 590 dtypes = [np.float32, np.float64] 591 else: 592 dtypes = [np.float32] 593 594 if rng_class is RanluxGenerator: 595 gen = rng_class(queue, 5120) 596 else: 597 gen = rng_class(context) 598 599 for dtype in dtypes: 600 print(dtype) 601 ran = cl_array.zeros(queue, ary_size, dtype) 602 gen.fill_uniform(ran) 603 604 if plot_hist: 605 import matplotlib.pyplot as pt 606 pt.hist(ran.get(), 30) 607 pt.show() 608 609 assert (0 <= ran.get()).all() 610 assert (ran.get() <= 1).all() 611 612 if rng_class is RanluxGenerator: 613 gen.synchronize(queue) 614 615 ran = cl_array.zeros(queue, ary_size, dtype) 616 gen.fill_uniform(ran, a=4, b=7) 617 ran_host = ran.get() 618 619 for cond in [4 <= ran_host, ran_host <= 7]: 620 good = cond.all() 621 if not good: 622 print(np.where(~cond)) 623 print(ran_host[~cond]) 624 assert good 625 626 ran = gen.normal(queue, ary_size, dtype, mu=10, sigma=3) 627 628 if plot_hist: 629 import matplotlib.pyplot as pt 630 pt.hist(ran.get(), 30) 631 pt.show() 632 633 634@pytest.mark.parametrize("dtype", [np.int32, np.int64]) 635@pytest.mark.parametrize("rng_class", 636 [RanluxGenerator, PhiloxGenerator, ThreefryGenerator]) 637def test_random_int_in_range(ctx_factory, rng_class, dtype, plot_hist=False): 638 context = ctx_factory() 639 queue = cl.CommandQueue(context) 640 641 if rng_class is RanluxGenerator: 642 gen = rng_class(queue, 5120) 643 else: 644 gen = rng_class(context) 645 646 # if (dtype == np.int64 647 # and context.devices[0].platform.vendor.startswith("Advanced Micro")): 648 # pytest.xfail("AMD miscompiles 64-bit RNG math") 649 650 ran = gen.uniform(queue, (10000007,), dtype, a=200, b=300).get() 651 assert (200 <= ran).all() 652 assert (ran < 300).all() 653 654 print(np.min(ran), np.max(ran)) 655 assert np.max(ran) > 295 656 657 if plot_hist: 658 from matplotlib import pyplot as pt 659 pt.hist(ran) 660 pt.show() 661 662# }}} 663 664 665# {{{ misc 666 667def test_numpy_integer_shape(ctx_factory): 668 try: 669 list(np.int32(17)) 670 except Exception: 671 pass 672 else: 673 from pytest import skip 674 skip("numpy implementation does not handle scalar correctly.") 675 context = ctx_factory() 676 queue = cl.CommandQueue(context) 677 678 cl_array.empty(queue, np.int32(17), np.float32) 679 cl_array.empty(queue, (np.int32(17), np.int32(17)), np.float32) 680 681 682def test_len(ctx_factory): 683 context = ctx_factory() 684 queue = cl.CommandQueue(context) 685 686 a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) 687 a_cpu = cl_array.to_device(queue, a) 688 assert len(a_cpu) == 10 689 690 691def test_stride_preservation(ctx_factory): 692 if _PYPY: 693 pytest.xfail("numpypy: no array creation from __array_interface__") 694 695 context = ctx_factory() 696 queue = cl.CommandQueue(context) 697 698 a = np.random.rand(3, 3) 699 at = a.T 700 print(at.flags.f_contiguous, at.flags.c_contiguous) 701 at_gpu = cl_array.to_device(queue, at) 702 print(at_gpu.flags.f_contiguous, at_gpu.flags.c_contiguous) 703 assert np.allclose(at_gpu.get(), at) 704 705 706def test_nan_arithmetic(ctx_factory): 707 context = ctx_factory() 708 queue = cl.CommandQueue(context) 709 710 def make_nan_contaminated_vector(size): 711 shape = (size,) 712 a = np.random.randn(*shape).astype(np.float32) 713 from random import randrange 714 for i in range(size // 10): 715 a[randrange(0, size)] = float('nan') 716 return a 717 718 size = 1 << 20 719 720 a = make_nan_contaminated_vector(size) 721 a_gpu = cl_array.to_device(queue, a) 722 b = make_nan_contaminated_vector(size) 723 b_gpu = cl_array.to_device(queue, b) 724 725 ab = a * b 726 ab_gpu = (a_gpu * b_gpu).get() 727 728 assert (np.isnan(ab) == np.isnan(ab_gpu)).all() 729 730 731def test_mem_pool_with_arrays(ctx_factory): 732 context = ctx_factory() 733 queue = cl.CommandQueue(context) 734 mem_pool = cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)) 735 736 a_dev = cl_array.arange(queue, 2000, dtype=np.float32, allocator=mem_pool) 737 b_dev = cl_array.to_device(queue, np.arange(2000), allocator=mem_pool) + 4000 738 739 assert a_dev.allocator is mem_pool 740 assert b_dev.allocator is mem_pool 741 742 743def test_view(ctx_factory): 744 context = ctx_factory() 745 queue = cl.CommandQueue(context) 746 747 a = np.arange(128).reshape(8, 16).astype(np.float32) 748 a_dev = cl_array.to_device(queue, a) 749 750 # same dtype 751 view = a_dev.view() 752 assert view.shape == a_dev.shape and view.dtype == a_dev.dtype 753 754 # larger dtype 755 view = a_dev.view(np.complex64) 756 assert view.shape == (8, 8) and view.dtype == np.complex64 757 758 # smaller dtype 759 view = a_dev.view(np.int16) 760 assert view.shape == (8, 32) and view.dtype == np.int16 761 762 763def test_diff(ctx_factory): 764 context = ctx_factory() 765 queue = cl.CommandQueue(context) 766 767 from pyopencl.clrandom import rand as clrand 768 769 ary_len = 20000 770 a_dev = clrand(queue, (ary_len,), dtype=np.float32) 771 a = a_dev.get() 772 773 err = la.norm( 774 (cl.array.diff(a_dev).get() - np.diff(a))) 775 assert err < 1e-4 776 777 778def test_copy(ctx_factory): 779 context = ctx_factory() 780 queue1 = cl.CommandQueue(context) 781 queue2 = cl.CommandQueue(context) 782 783 # Test copy 784 785 arr = cl.array.zeros(queue1, 100, np.int32) 786 arr_copy = arr.copy() 787 788 assert (arr == arr_copy).all().get() 789 assert arr.data != arr_copy.data 790 assert arr_copy.queue is queue1 791 792 # Test queue association 793 794 arr_copy = arr.copy(queue=queue2) 795 assert arr_copy.queue is queue2 796 797 arr_copy = arr.copy(queue=None) 798 assert arr_copy.queue is None 799 800 arr_copy = arr.with_queue(None).copy(queue=queue1) 801 assert arr_copy.queue is queue1 802 803# }}} 804 805 806# {{{ slices, concatenation 807 808def test_slice(ctx_factory): 809 if _PYPY: 810 pytest.xfail("numpypy: spurious as_strided failure") 811 812 context = ctx_factory() 813 queue = cl.CommandQueue(context) 814 815 from pyopencl.clrandom import rand as clrand 816 817 tp = np.float32 818 819 ary_len = 20000 820 a_gpu = clrand(queue, (ary_len,), dtype=tp) 821 b_gpu = clrand(queue, (ary_len,), dtype=tp) 822 a = a_gpu.get() 823 b = b_gpu.get() 824 825 from random import randrange 826 for i in range(20): 827 start = randrange(ary_len) 828 end = randrange(start, ary_len) 829 830 a_gpu_slice = tp(2)*a_gpu[start:end] 831 a_slice = tp(2)*a[start:end] 832 833 assert la.norm(a_gpu_slice.get() - a_slice) == 0 834 835 for i in range(20): 836 start = randrange(ary_len) 837 end = randrange(start, ary_len) 838 839 a_gpu[start:end] = tp(2)*b[start:end] 840 a[start:end] = tp(2)*b[start:end] 841 842 assert la.norm(a_gpu.get() - a) == 0 843 844 for i in range(20): 845 start = randrange(ary_len) 846 end = randrange(start, ary_len) 847 848 a_gpu[start:end] = tp(2)*b_gpu[start:end] 849 a[start:end] = tp(2)*b[start:end] 850 851 assert la.norm(a_gpu.get() - a) == 0 852 853 854def test_concatenate(ctx_factory): 855 context = ctx_factory() 856 queue = cl.CommandQueue(context) 857 858 from pyopencl.clrandom import rand as clrand 859 860 a_dev = clrand(queue, (5, 15, 20), dtype=np.float32) 861 b_dev = clrand(queue, (4, 15, 20), dtype=np.float32) 862 c_dev = clrand(queue, (3, 15, 20), dtype=np.float32) 863 a = a_dev.get() 864 b = b_dev.get() 865 c = c_dev.get() 866 867 cat_dev = cl.array.concatenate((a_dev, b_dev, c_dev)) 868 cat = np.concatenate((a, b, c)) 869 870 assert la.norm(cat - cat_dev.get()) == 0 871 872# }}} 873 874 875# {{{ conditionals, any, all 876 877def test_comparisons(ctx_factory): 878 context = ctx_factory() 879 queue = cl.CommandQueue(context) 880 881 from pyopencl.clrandom import rand as clrand 882 883 ary_len = 20000 884 a_dev = clrand(queue, (ary_len,), dtype=np.float32) 885 b_dev = clrand(queue, (ary_len,), dtype=np.float32) 886 887 a = a_dev.get() 888 b = b_dev.get() 889 890 import operator as o 891 for op in [o.eq, o.ne, o.le, o.lt, o.ge, o.gt]: 892 res_dev = op(a_dev, b_dev) 893 res = op(a, b) 894 895 assert (res_dev.get() == res).all() 896 897 res_dev = op(a_dev, 0) 898 res = op(a, 0) 899 900 assert (res_dev.get() == res).all() 901 902 res_dev = op(0, b_dev) 903 res = op(0, b) 904 905 assert (res_dev.get() == res).all() 906 907 908def test_any_all(ctx_factory): 909 context = ctx_factory() 910 queue = cl.CommandQueue(context) 911 912 ary_len = 20000 913 a_dev = cl_array.zeros(queue, (ary_len,), dtype=np.int8) 914 915 assert not a_dev.all().get() 916 assert not a_dev.any().get() 917 918 a_dev[15213] = 1 919 920 assert not a_dev.all().get() 921 assert a_dev.any().get() 922 923 a_dev.fill(1) 924 925 assert a_dev.all().get() 926 assert a_dev.any().get() 927 928# }}} 929 930 931def test_map_to_host(ctx_factory): 932 if _PYPY: 933 pytest.skip("numpypy: no array creation from __array_interface__") 934 935 context = ctx_factory() 936 queue = cl.CommandQueue(context) 937 938 if context.devices[0].type & cl.device_type.GPU: 939 mf = cl.mem_flags 940 allocator = cl_tools.DeferredAllocator( 941 context, mf.READ_WRITE | mf.ALLOC_HOST_PTR) 942 else: 943 allocator = None 944 945 a_dev = cl_array.zeros(queue, (5, 6, 7,), dtype=np.float32, allocator=allocator) 946 a_dev[3, 2, 1] = 10 947 a_host = a_dev.map_to_host() 948 a_host[1, 2, 3] = 10 949 950 a_host_saved = a_host.copy() 951 a_host.base.release(queue) 952 953 a_dev.finish() 954 955 print("DEV[HOST_WRITE]", a_dev.get()[1, 2, 3]) 956 print("HOST[DEV_WRITE]", a_host_saved[3, 2, 1]) 957 958 assert (a_host_saved == a_dev.get()).all() 959 960 961def test_view_and_strides(ctx_factory): 962 if _PYPY: 963 pytest.xfail("numpypy: no array creation from __array_interface__") 964 return 965 966 context = ctx_factory() 967 queue = cl.CommandQueue(context) 968 969 from pyopencl.clrandom import rand as clrand 970 971 x = clrand(queue, (5, 10), dtype=np.float32) 972 y = x[:3, :5] 973 yv = y.view() 974 975 assert yv.shape == y.shape 976 assert yv.strides == y.strides 977 978 with pytest.raises(AssertionError): 979 assert (yv.get() == x.get()[:3, :5]).all() 980 981 982def test_meshmode_view(ctx_factory): 983 if _PYPY: 984 # https://bitbucket.org/pypy/numpy/issue/28/indexerror-on-ellipsis-slice 985 pytest.xfail("numpypy bug #28") 986 987 context = ctx_factory() 988 queue = cl.CommandQueue(context) 989 990 n = 2 991 result = cl.array.empty(queue, (2, n*6), np.float32) 992 993 def view(z): 994 return z[..., n*3:n*6].reshape(z.shape[:-1] + (n, 3)) 995 996 result = result.with_queue(queue) 997 result.fill(0) 998 view(result)[0].fill(1) 999 view(result)[1].fill(1) 1000 x = result.get() 1001 assert (view(x) == 1).all() 1002 1003 1004def test_event_management(ctx_factory): 1005 context = ctx_factory() 1006 queue = cl.CommandQueue(context) 1007 1008 from pyopencl.clrandom import rand as clrand 1009 1010 x = clrand(queue, (5, 10), dtype=np.float32) 1011 assert len(x.events) == 1, len(x._events) 1012 1013 x.finish() 1014 1015 assert len(x.events) == 0 1016 1017 y = x+x 1018 assert len(y.events) == 1 1019 y = x*x 1020 assert len(y.events) == 1 1021 y = 2*x 1022 assert len(y.events) == 1 1023 y = 2/x 1024 assert len(y.events) == 1 1025 y = x/2 1026 assert len(y.events) == 1 1027 y = x**2 1028 assert len(y.events) == 1 1029 y = 2**x 1030 assert len(y.events) == 1 1031 1032 for i in range(10): 1033 x.fill(0) 1034 1035 assert len(x.events) == 10 1036 1037 for i in range(1000): 1038 x.fill(0) 1039 1040 assert len(x.events) < 100 1041 1042 1043def test_reshape(ctx_factory): 1044 context = ctx_factory() 1045 queue = cl.CommandQueue(context) 1046 1047 a = np.arange(128).reshape(8, 16).astype(np.float32) 1048 a_dev = cl_array.to_device(queue, a) 1049 1050 # different ways to specify the shape 1051 a_dev.reshape(4, 32) 1052 a_dev.reshape((4, 32)) 1053 a_dev.reshape([4, 32]) 1054 1055 # using -1 as unknown dimension 1056 assert a_dev.reshape(-1, 32).shape == (4, 32) 1057 assert a_dev.reshape((32, -1)).shape == (32, 4) 1058 assert a_dev.reshape(((8, -1, 4))).shape == (8, 4, 4) 1059 1060 import pytest 1061 with pytest.raises(ValueError): 1062 a_dev.reshape(-1, -1, 4) 1063 1064 1065def test_skip_slicing(ctx_factory): 1066 context = ctx_factory() 1067 queue = cl.CommandQueue(context) 1068 1069 a_host = np.arange(16).reshape((4, 4)) 1070 b_host = a_host[::3] 1071 1072 a = cl_array.to_device(queue, a_host) 1073 b = a[::3] 1074 assert b.shape == b_host.shape 1075 assert np.array_equal(b[1].get(), b_host[1]) 1076 1077 1078def test_transpose(ctx_factory): 1079 if _PYPY: 1080 pytest.xfail("numpypy: no array creation from __array_interface__") 1081 1082 context = ctx_factory() 1083 queue = cl.CommandQueue(context) 1084 1085 from pyopencl.clrandom import rand as clrand 1086 1087 a_gpu = clrand(queue, (10, 20, 30), dtype=np.float32) 1088 a = a_gpu.get() 1089 1090 # FIXME: not contiguous 1091 #assert np.allclose(a_gpu.transpose((1,2,0)).get(), a.transpose((1,2,0))) 1092 assert np.array_equal(a_gpu.T.get(), a.T) 1093 1094 1095def test_newaxis(ctx_factory): 1096 context = ctx_factory() 1097 queue = cl.CommandQueue(context) 1098 1099 from pyopencl.clrandom import rand as clrand 1100 1101 a_gpu = clrand(queue, (10, 20, 30), dtype=np.float32) 1102 a = a_gpu.get() 1103 1104 b_gpu = a_gpu[:, np.newaxis] 1105 b = a[:, np.newaxis] 1106 1107 assert b_gpu.shape == b.shape 1108 for i in range(b.ndim): 1109 if b.shape[i] > 1: 1110 assert b_gpu.strides[i] == b.strides[i] 1111 1112 1113def test_squeeze(ctx_factory): 1114 context = ctx_factory() 1115 queue = cl.CommandQueue(context) 1116 1117 shape = (40, 2, 5, 100) 1118 a_cpu = np.random.random(size=shape) 1119 a_gpu = cl_array.to_device(queue, a_cpu) 1120 1121 # Slice with length 1 on dimensions 0 and 1 1122 a_gpu_slice = a_gpu[0:1, 1:2, :, :] 1123 assert a_gpu_slice.shape == (1, 1, shape[2], shape[3]) 1124 assert a_gpu_slice.flags.c_contiguous 1125 1126 # Squeeze it and obtain contiguity 1127 a_gpu_squeezed_slice = a_gpu[0:1, 1:2, :, :].squeeze() 1128 assert a_gpu_squeezed_slice.shape == (shape[2], shape[3]) 1129 assert a_gpu_squeezed_slice.flags.c_contiguous 1130 1131 # Check that we get the original values out 1132 #assert np.all(a_gpu_slice.get().ravel() == a_gpu_squeezed_slice.get().ravel()) 1133 1134 # Slice with length 1 on dimensions 2 1135 a_gpu_slice = a_gpu[:, :, 2:3, :] 1136 assert a_gpu_slice.shape == (shape[0], shape[1], 1, shape[3]) 1137 assert not a_gpu_slice.flags.c_contiguous 1138 1139 # Squeeze it, but no contiguity here 1140 a_gpu_squeezed_slice = a_gpu[:, :, 2:3, :].squeeze() 1141 assert a_gpu_squeezed_slice.shape == (shape[0], shape[1], shape[3]) 1142 assert not a_gpu_squeezed_slice.flags.c_contiguous 1143 1144 # Check that we get the original values out 1145 #assert np.all(a_gpu_slice.get().ravel() == a_gpu_squeezed_slice.get().ravel()) 1146 1147 1148def test_fancy_fill(ctx_factory): 1149 if _PYPY: 1150 pytest.xfail("numpypy: multi value setting is not supported") 1151 context = ctx_factory() 1152 queue = cl.CommandQueue(context) 1153 1154 numpy_dest = np.zeros((4,), np.int32) 1155 numpy_idx = np.arange(3, dtype=np.int32) 1156 numpy_src = np.arange(8, 9, dtype=np.int32) 1157 numpy_dest[numpy_idx] = numpy_src 1158 1159 cl_dest = cl_array.zeros(queue, (4,), np.int32) 1160 cl_idx = cl_array.arange(queue, 3, dtype=np.int32) 1161 cl_src = cl_array.arange(queue, 8, 9, dtype=np.int32) 1162 cl_dest[cl_idx] = cl_src 1163 1164 assert np.all(numpy_dest == cl_dest.get()) 1165 1166 1167def test_fancy_indexing(ctx_factory): 1168 if _PYPY: 1169 pytest.xfail("numpypy: multi value setting is not supported") 1170 context = ctx_factory() 1171 queue = cl.CommandQueue(context) 1172 1173 numpy_dest = np.zeros((4,), np.int32) 1174 numpy_idx = np.arange(3, 0, -1, dtype=np.int32) 1175 numpy_src = np.arange(8, 11, dtype=np.int32) 1176 numpy_dest[numpy_idx] = numpy_src 1177 1178 cl_dest = cl_array.zeros(queue, (4,), np.int32) 1179 cl_idx = cl_array.arange(queue, 3, 0, -1, dtype=np.int32) 1180 cl_src = cl_array.arange(queue, 8, 11, dtype=np.int32) 1181 cl_dest[cl_idx] = cl_src 1182 1183 assert np.all(numpy_dest == cl_dest.get()) 1184 1185 cl_idx[1] = 3 1186 cl_idx[2] = 2 1187 1188 numpy_idx[1] = 3 1189 numpy_idx[2] = 2 1190 1191 numpy_dest[numpy_idx] = numpy_src 1192 cl_dest[cl_idx] = cl_src 1193 1194 assert np.all(numpy_dest == cl_dest.get()) 1195 1196 1197def test_multi_put(ctx_factory): 1198 if _PYPY: 1199 pytest.xfail("numpypy: multi value setting is not supported") 1200 1201 context = ctx_factory() 1202 queue = cl.CommandQueue(context) 1203 1204 cl_arrays = [ 1205 cl_array.arange(queue, 0, 3, dtype=np.float32) 1206 for i in range(1, 10) 1207 ] 1208 idx = cl_array.arange(queue, 0, 6, dtype=np.int32) 1209 out_arrays = [ 1210 cl_array.zeros(queue, (10,), np.float32) 1211 for i in range(9) 1212 ] 1213 1214 out_compare = [np.zeros((10,), np.float32) for i in range(9)] 1215 for i, ary in enumerate(out_compare): 1216 ary[idx.get()] = np.arange(0, 6, dtype=np.float32) 1217 1218 cl_array.multi_put(cl_arrays, idx, out=out_arrays) 1219 1220 assert np.all(np.all(out_compare[i] == out_arrays[i].get()) for i in range(9)) 1221 1222 1223def test_outoforderqueue_get(ctx_factory): 1224 context = ctx_factory() 1225 try: 1226 queue = cl.CommandQueue(context, 1227 properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) 1228 except Exception: 1229 pytest.skip("out-of-order queue not available") 1230 a = np.random.rand(10**6).astype(np.dtype('float32')) 1231 a_gpu = cl_array.to_device(queue, a) 1232 b_gpu = a_gpu + a_gpu**5 + 1 1233 b1 = b_gpu.get() # testing that this waits for events 1234 b = a + a**5 + 1 1235 assert np.abs(b1 - b).mean() < 1e-5 1236 1237 1238def test_outoforderqueue_copy(ctx_factory): 1239 context = ctx_factory() 1240 try: 1241 queue = cl.CommandQueue(context, 1242 properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) 1243 except Exception: 1244 pytest.skip("out-of-order queue not available") 1245 a = np.random.rand(10**6).astype(np.dtype('float32')) 1246 a_gpu = cl_array.to_device(queue, a) 1247 c_gpu = a_gpu**2 - 7 1248 b_gpu = c_gpu.copy() # testing that this waits for and creates events 1249 b_gpu *= 10 1250 queue.finish() 1251 b1 = b_gpu.get() 1252 b = 10 * (a**2 - 7) 1253 assert np.abs(b1 - b).mean() < 1e-5 1254 1255 1256def test_outoforderqueue_indexing(ctx_factory): 1257 context = ctx_factory() 1258 try: 1259 queue = cl.CommandQueue(context, 1260 properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) 1261 except Exception: 1262 pytest.skip("out-of-order queue not available") 1263 a = np.random.rand(10**6).astype(np.dtype('float32')) 1264 i = (8e5 + 1e5 * np.random.rand(10**5)).astype(np.dtype('int32')) 1265 a_gpu = cl_array.to_device(queue, a) 1266 i_gpu = cl_array.to_device(queue, i) 1267 c_gpu = (a_gpu**2)[i_gpu - 10000] 1268 b_gpu = 10 - a_gpu 1269 b_gpu[:] = 8 * a_gpu 1270 b_gpu[i_gpu + 10000] = c_gpu - 10 1271 queue.finish() 1272 b1 = b_gpu.get() 1273 c = (a**2)[i - 10000] 1274 b = 8 * a 1275 b[i + 10000] = c - 10 1276 assert np.abs(b1 - b).mean() < 1e-5 1277 1278 1279def test_outoforderqueue_reductions(ctx_factory): 1280 context = ctx_factory() 1281 try: 1282 queue = cl.CommandQueue(context, 1283 properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE) 1284 except Exception: 1285 pytest.skip("out-of-order queue not available") 1286 # 0/1 values to avoid accumulated rounding error 1287 a = (np.random.rand(10**6) > 0.5).astype(np.dtype('float32')) 1288 a[800000] = 10 # all<5 looks true until near the end 1289 a_gpu = cl_array.to_device(queue, a) 1290 b1 = cl_array.sum(a_gpu).get() 1291 b2 = cl_array.dot(a_gpu, 3 - a_gpu).get() 1292 b3 = (a_gpu < 5).all().get() 1293 assert b1 == a.sum() and b2 == a.dot(3 - a) and b3 == 0 1294 1295 1296if __name__ == "__main__": 1297 # make sure that import failures get reported, instead of skipping the 1298 # tests. 1299 if len(sys.argv) > 1: 1300 exec(sys.argv[1]) 1301 else: 1302 from pytest import main 1303 main([__file__]) 1304 1305# vim: filetype=pyopencl:fdm=marker 1306