1 //===--- simple_example.cu - Simple example of using Acxxel ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// This file is a simple example of using Acxxel.
10 ///
11 //===----------------------------------------------------------------------===//
12 
13 /// [Example simple saxpy]
14 #include "acxxel.h"
15 
16 #include <array>
17 #include <cstdio>
18 #include <cstdlib>
19 
// A standard CUDA kernel computing X[I] = A * X[I] + Y[I] for 0 <= I < N.
//
// Launch layout: one-dimensional grid of one-dimensional blocks; only the .x
// components of the built-in index variables are read. Each thread starts at
// its global index and advances by the total number of launched threads, so
// every element is processed no matter how many blocks or threads are used.
// No shared memory is used.
//
// Preconditions: X and Y must each be dereferenceable from device code for at
// least N elements. X is updated in place; Y is only read.
__global__ void saxpyKernel(float A, float *X, float *Y, int N) {
  // Index arithmetic is done in 64 bits so that the product of the grid and
  // block dimensions cannot wrap before it is compared against N.
  const long long Stride = static_cast<long long>(gridDim.x) * blockDim.x;
  const long long First =
      static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

  for (long long I = First; I < N; I += Stride)
    X[I] = A * X[I] + Y[I];
}
26 
// A host library wrapping the CUDA kernel. All Acxxel calls are in here.
//
// Computes X = A * X + Y on the device and writes the result back into X.
//
// \param A scalar multiplier.
// \param X host array that is read as input and overwritten with the result.
// \param Y host array that is only read.
//
// If the status taken from the stream after the final copy reports an error,
// the message is printed to stderr and the process exits with EXIT_FAILURE.
template <size_t N>
void saxpy(float A, std::array<float, N> &X, const std::array<float, N> &Y) {
  // Nothing to compute for empty arrays, and a launch with zero blocks is not
  // a valid configuration.
  if (N == 0)
    return;

  // Use a fixed block size and enough blocks to cover all N elements, so the
  // launch does not depend on N fitting into a single block. The kernel's
  // bounds check keeps the surplus threads of the last block idle.
  constexpr unsigned ThreadsPerBlock = 256;
  constexpr unsigned BlockCount =
      static_cast<unsigned>((N + ThreadsPerBlock - 1) / ThreadsPerBlock);

  // Get the CUDA platform and make a CUDA stream.
  acxxel::Platform *CUDA = acxxel::getCUDAPlatform().getValue();
  acxxel::Stream Stream = CUDA->createStream().takeValue();

  // Allocate space for device arrays.
  auto DeviceX = CUDA->mallocD<float>(N).takeValue();
  auto DeviceY = CUDA->mallocD<float>(N).takeValue();

  // Copy X and Y out to the device.
  Stream.syncCopyHToD(X, DeviceX).syncCopyHToD(Y, DeviceY);

  // Launch the kernel on the stream using triple-chevron notation. The kernel
  // takes its element count as an int, so the conversion is made explicit.
  saxpyKernel<<<BlockCount, ThreadsPerBlock, 0, Stream>>>(
      A, DeviceX, DeviceY, static_cast<int>(N));

  // Copy the results back to the host.
  acxxel::Status Status = Stream.syncCopyDToH(DeviceX, X).takeStatus();

  // Check for any errors.
  if (Status.isError()) {
    std::fprintf(stderr, "Error performing acxxel saxpy: %s\n",
                 Status.getMessage().c_str());
    std::exit(EXIT_FAILURE);
  }
}
54 /// [Example simple saxpy]
55 
56 /// [Example CUDA simple saxpy]
// Plain CUDA runtime version of saxpy: computes X = A * X + Y on the device
// and writes the result back into X.
//
// \param A scalar multiplier.
// \param X host array that is read as input and overwritten with the result.
// \param Y host array that is only read.
//
// Every runtime call is checked. After the first failure the remaining device
// work is skipped, the device buffers are released, the error string is
// printed to stderr and the process exits with EXIT_FAILURE.
template <size_t N>
void cudaSaxpy(float A, std::array<float, N> &X, std::array<float, N> &Y) {
  // Nothing to compute for empty arrays, and a launch with zero blocks is not
  // a valid configuration.
  if (N == 0)
    return;

  // This size is needed all over the place, so give it a name.
  constexpr size_t Size = N * sizeof(float);

  // Use a fixed block size and enough blocks to cover all N elements, so the
  // launch does not depend on N fitting into a single block. The kernel's
  // bounds check keeps the surplus threads of the last block idle.
  constexpr unsigned ThreadsPerBlock = 256;
  constexpr unsigned BlockCount =
      static_cast<unsigned>((N + ThreadsPerBlock - 1) / ThreadsPerBlock);

  // Start from null so the cleanup below is safe even if allocation fails.
  float *DeviceX = nullptr;
  float *DeviceY = nullptr;

  // Allocate space for device arrays.
  cudaError_t Error = cudaMalloc(&DeviceX, Size);
  if (Error == cudaSuccess)
    Error = cudaMalloc(&DeviceY, Size);

  // Copy X and Y out to the device.
  if (Error == cudaSuccess)
    Error = cudaMemcpy(DeviceX, X.data(), Size, cudaMemcpyHostToDevice);
  if (Error == cudaSuccess)
    Error = cudaMemcpy(DeviceY, Y.data(), Size, cudaMemcpyHostToDevice);

  // Launch the kernel using triple-chevron notation. A launch does not return
  // a status itself, so query the runtime for a launch error right after it.
  if (Error == cudaSuccess) {
    saxpyKernel<<<BlockCount, ThreadsPerBlock>>>(A, DeviceX, DeviceY,
                                                 static_cast<int>(N));
    Error = cudaGetLastError();
  }

  // Copy the results back to the host.
  if (Error == cudaSuccess)
    Error = cudaMemcpy(X.data(), DeviceX, Size, cudaMemcpyDeviceToHost);

  // Free resources on every path; freeing a null pointer is a no-op.
  const cudaError_t FreeXError = cudaFree(DeviceX);
  const cudaError_t FreeYError = cudaFree(DeviceY);
  if (Error == cudaSuccess)
    Error = FreeXError;
  if (Error == cudaSuccess)
    Error = FreeYError;

  // Report the first error that occurred, if any.
  if (Error != cudaSuccess) {
    std::fprintf(stderr, "Error performing cudart saxpy: %s\n",
                 cudaGetErrorString(Error));
    std::exit(EXIT_FAILURE);
  }
}
90 /// [Example CUDA simple saxpy]
91 
// Checks a saxpy implementation against a fixed three-element example.
//
// SaxpyFunction is invoked once as SaxpyFunction(scalar, X, Y) and is expected
// to leave scalar * X + Y in X. On the first element that differs from the
// expected value, a message is printed to stderr and the process exits with
// EXIT_FAILURE; otherwise the function simply returns.
template <typename F> void testSaxpy(F &&SaxpyFunction) {
  constexpr int Count = 3;

  float Scale = 2.f;
  std::array<float, Count> X = {{0.f, 1.f, 2.f}};
  std::array<float, Count> Y = {{3.f, 4.f, 5.f}};
  const std::array<float, Count> Expected = {{3.f, 6.f, 9.f}};

  SaxpyFunction(Scale, X, Y);

  for (int Index = 0; Index != Count; ++Index) {
    // Matching elements need no further attention.
    if (X[Index] == Expected[Index])
      continue;

    std::fprintf(stderr, "Result mismatch at index %d, %f != %f\n", Index,
                 X[Index], Expected[Index]);
    std::exit(EXIT_FAILURE);
  }
}
105 
// Runs the three-element saxpy check against the Acxxel-based wrapper first
// and then against the plain CUDA runtime version.
int main() {
  constexpr size_t Length = 3;

  testSaxpy(saxpy<Length>);
  testSaxpy(cudaSaxpy<Length>);

  return EXIT_SUCCESS;
}
110