//////////////////////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source License.
// See LICENSE file in top directory for details.
//
// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
//
// File developed by: Ken Esler, kpesler@gmail.com, University of Illinois at Urbana-Champaign
//                    Jeremy McMinnis, jmcminis@gmail.com, University of Illinois at Urbana-Champaign
//                    Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois at Urbana-Champaign
//                    Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//
// File created by: Ken Esler, kpesler@gmail.com, University of Illinois at Urbana-Champaign
//////////////////////////////////////////////////////////////////////////////////////


#ifndef GPU_VECTOR_H
#define GPU_VECTOR_H

#include <malloc.h>
#include <iostream>
#include <string>
#include <map>
#include <vector>
#include <cmath>
#include <cstdlib>
#include <cstdio>
#include <algorithm>

#ifdef QMC_CUDA
#include <cuda_runtime_api.h>
#include "gpu_misc.h"
#endif

namespace gpu
{
struct gpu_mem_object
{
  size_t num_objects;
  size_t total_bytes;
  gpu_mem_object(size_t size) : num_objects(1), total_bytes(size) {}
  gpu_mem_object() : num_objects(0), total_bytes(0) {}
};


// Bookkeeping for GPU allocations: every allocation is recorded under a
// name so usage can be tracked and reported per buffer.
class cuda_memory_manager_type
{
private:
  // Per-name totals: number of live objects sharing a name and their combined size.
  std::map<std::string, gpu_mem_object> gpu_mem_map;
  // Maps each live device pointer back to its name and byte size.
  std::map<void*, std::pair<std::string, size_t>> gpu_pointer_map;

public:
  void* allocate(size_t bytes, std::string name = "");
  void* allocate_managed(size_t bytes, std::string name = "", unsigned int flags = cudaMemAttachGlobal);

  void deallocate(void* p);

  void report();
};

extern cuda_memory_manager_type cuda_memory_manager;
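
// Usage sketch (illustrative; the buffer name below is hypothetical):
//
//   void* buf = cuda_memory_manager.allocate(1024 * sizeof(float), "spline_coefs");
//   ...
//   cuda_memory_manager.report();        // summarize current GPU allocations
//   cuda_memory_manager.deallocate(buf);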

template<typename T>
class host_vector;

// Vector living in GPU device memory, allocated through cuda_memory_manager.
template<typename T>
class device_vector
{
private:
  T* data_pointer;
  size_t current_size, alloc_size;
  std::string name;
  // True if the data was allocated by this vector.  False if we're
  // referencing memory owned elsewhere (see reference()).
  bool own_data;
  // True if managed memory was requested via the resize function; starts out false.
  bool managedmem;
  // Flags for managed-memory creation (defaults to cudaMemAttachGlobal);
  // can be changed with the set_managed_flags function.
  unsigned int managed_flags;

public:
  typedef T* pointer;

  void set_name(std::string n) { name = n; }

  void set_managed_flags(unsigned int flags) { managed_flags = flags; }

  inline device_vector()
      : data_pointer(NULL),
        current_size(0),
        alloc_size(0),
        own_data(true),
        managedmem(false),
        managed_flags(cudaMemAttachGlobal)
  {}

  inline device_vector(std::string myName)
      : data_pointer(NULL),
        current_size(0),
        alloc_size(0),
        name(myName),
        own_data(true),
        managedmem(false),
        managed_flags(cudaMemAttachGlobal)
  {}

  inline device_vector(size_t size)
      : data_pointer(NULL),
        current_size(0),
        alloc_size(0),
        own_data(true),
        managedmem(false),
        managed_flags(cudaMemAttachGlobal)
  {
    resize(size);
  }

  inline device_vector(std::string myName, size_t size)
      : data_pointer(NULL),
        current_size(0),
        alloc_size(0),
        name(myName),
        own_data(true),
        managedmem(false),
        managed_flags(cudaMemAttachGlobal)
  {
    resize(size);
  }

  inline device_vector(const host_vector<T>& vec);

  ~device_vector()
  {
    if (alloc_size > 0 && data_pointer && own_data)
      cuda_memory_manager.deallocate(data_pointer);
  }

  // Attach this vector to memory owned elsewhere.  The vector performs no copy
  // and will not free the pointer on destruction (own_data becomes false).
  inline void reference(T* p, size_t size)
  {
    // fprintf (stderr, "reference called for name=%s size=%ld ptr=%p\n",
    //          name.c_str(), size, p);
    if (own_data && alloc_size > 0)
      cuda_memory_manager.deallocate(data_pointer);
    data_pointer = p;
    current_size = size;
    alloc_size   = 0;
    own_data     = false;
    managedmem   = false;
  }
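
  // Example (illustrative): aliasing a sub-range of an existing allocation.
  // Assuming `pool` is an already-sized device_vector<float>, the hypothetical
  // view below shares its storage and must not outlive it:
  //
  //   device_vector<float> view("pool_view");
  //   view.reference(pool.data() + offset, count);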


  // Returns a reference into *device* memory; on the host this is only useful
  // for taking addresses (e.g. &vec[0]), not for dereferencing.
  inline T& operator[](size_t i) const { return data_pointer[i]; }


  inline void resize(size_t size, double reserve_factor = 1.0, bool managed = false)
  {
    if (!size)
    {
      current_size = 0;
      return;
    }
    size_t reserve_size = (size_t)std::ceil(reserve_factor * size);
    size_t byte_size    = sizeof(T) * reserve_size;
    bool error          = false;
    if (managed != managedmem)
    {
      if (managedmem)
      {
        if (alloc_size > 0) // Only trigger the error message if memory is allocated
        {
          fprintf(stderr, "device_vector.resize from managed (%p) ", data_pointer);
          error = true;
        }
      }
      else
      {
        if (alloc_size != 0)
          fprintf(stderr, "device_vector.resize from non-managed to managed.\n");
      }
    }
    if ((size > alloc_size) || (alloc_size == 0))
    {
      if (own_data && (alloc_size > 0))
        cuda_memory_manager.deallocate(data_pointer);
      if (managed)
        data_pointer = (T*)cuda_memory_manager.allocate_managed(byte_size, name, managed_flags);
      else
        data_pointer = (T*)cuda_memory_manager.allocate(byte_size, name);
      alloc_size = reserve_size;
      own_data   = true;
      managedmem = managed;
    }
    current_size = size;
    if (error)
      fprintf(stderr, "to non-managed (%p).\n", data_pointer);
  }
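
  // Example (illustrative): over-allocating to absorb repeated small growth.
  // Requesting 10% headroom lets later resizes within that bound reuse the
  // same allocation instead of triggering a fresh device allocation:
  //
  //   device_vector<double> d("walkers");
  //   d.resize(n, 1.1);      // allocates ceil(1.1 * n) elements
  //   d.resize(n + n / 20);  // fits inside the reserve; no reallocation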

  inline void clear()
  {
    if (alloc_size)
    {
      cuda_memory_manager.deallocate(data_pointer);
      data_pointer = NULL;
      current_size = alloc_size = 0;
    }
  }

  inline size_t size() const { return current_size; }


  inline device_vector& operator=(const device_vector<T>& vec)
  {
    if (this->size() != vec.size())
    {
      if (!own_data)
      {
        fprintf(stderr,
                "Assigning referenced GPU vector, but it has the "
                "wrong size.\n");
        fprintf(stderr, "Name = %s. This size = %ld, vec size = %ld\n", name.c_str(), size(), vec.size());
        abort();
      }
      resize(vec.size(), 1.0, managedmem);
    }
#ifdef QMC_CUDA
    cudaMemcpyAsync(data_pointer, &(vec[0]), this->size() * sizeof(T), cudaMemcpyDeviceToDevice);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
      fprintf(stderr, "CUDA error in device_vector::operator=(device_vector):\n %s\n", cudaGetErrorString(err));
      fprintf(stderr, "vec.size() = %ld\n", vec.size());
      abort();
    }
#endif
    return *this;
  }

  device_vector(const device_vector<T>& vec)
      : data_pointer(NULL),
        current_size(0),
        alloc_size(0),
        name(vec.name),
        own_data(true),
        managedmem(vec.managedmem),
        managed_flags(vec.managed_flags)
  {
    resize(vec.size(), 1.0, managedmem);
    // fprintf(stderr, "device_vector copy constructor called, name=%s.\n",
    //         name.c_str());
#ifdef QMC_CUDA
    if (this->size() != 0)
    {
      cudaMemcpy(data_pointer, &(vec[0]), vec.size() * sizeof(T), cudaMemcpyDeviceToDevice);
      cudaError_t err = cudaGetLastError();
      if (err != cudaSuccess)
      {
        fprintf(stderr, "CUDA error in device_vector::copy constructor:\n %s\n", cudaGetErrorString(err));
        abort();
      }
    }
#endif
  }

  device_vector& operator=(const std::vector<T, std::allocator<T>>& vec)
  {
    if (this->size() != vec.size())
    {
      if (!own_data)
      {
        fprintf(stderr,
                "Assigning referenced GPU vector, but it has the "
                "wrong size.\n");
        // fprintf (stderr, "Name = %s. This size = %ld, vec size = %ld\n",
        //          name.c_str(), size(), vec.size());
        abort();
      }
      resize(vec.size(), 1.0, managedmem);
    }
#ifdef QMC_CUDA
    cudaMemcpyAsync(data_pointer, &(vec[0]), this->size() * sizeof(T), cudaMemcpyHostToDevice);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
      fprintf(stderr, "CUDA error in device_vector::operator (%p)=(std::vector %p):\n %s\n", data_pointer, &(vec[0]),
              cudaGetErrorString(err));
      abort();
    }
#endif
    return *this;
  }

  device_vector& operator=(const host_vector<T>& vec)
  {
    if (this->size() != vec.size())
    {
      if (!own_data)
      {
        fprintf(stderr,
                "Assigning referenced GPU vector, but it has the "
                "wrong size.\n");
        fprintf(stderr, "Name = %s. This size = %ld, vec size = %ld\n", name.c_str(), size(), vec.size());
        abort();
      }
      resize(vec.size(), 1.0, managedmem);
    }
#ifdef QMC_CUDA
    cudaMemcpy(&((*this)[0]), &(vec[0]), vec.size() * sizeof(T), cudaMemcpyHostToDevice);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
      fprintf(stderr, "In operator=, name=%s, size=%ld vec.size()=%ld\n", name.c_str(), size(), vec.size());
      fprintf(stderr, "this pointer = %p vec pointer=%p\n", data_pointer, &(vec[0]));
      fprintf(stderr, "CUDA error in device_vector::operator=(const host_vector<T> &vec) for %s:\n %s\n", name.c_str(),
              cudaGetErrorString(err));
      abort();
    }
#endif
    return *this;
  }

  void asyncCopy(const host_vector<T>& vec)
  {
    if (this->size() != vec.size())
    {
      if (!own_data)
      {
        fprintf(stderr,
                "Assigning referenced GPU vector, but it has the "
                "wrong size.\n");
        fprintf(stderr, "Name = %s. This size = %ld, vec size = %ld\n", name.c_str(), size(), vec.size());
        abort();
      }
      resize(vec.size(), 1.0, managedmem);
    }
#ifdef QMC_CUDA
    cudaMemcpyAsync(&((*this)[0]), &(vec[0]), vec.size() * sizeof(T), cudaMemcpyHostToDevice, kernelStream);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
      fprintf(stderr, "In asyncCopy, name=%s, size=%ld vec.size()=%ld\n", name.c_str(), size(), vec.size());
      fprintf(stderr, "this pointer = %p vec pointer=%p\n", data_pointer, &(vec[0]));
      fprintf(stderr, "CUDA error in device_vector::asyncCopy(const host_vector<T> &vec) for %s:\n %s\n", name.c_str(),
              cudaGetErrorString(err));
      abort();
    }
#endif
  }

  void asyncCopy(const T* vec_ptr, size_t len, size_t offset, size_t datalen)
  {
    if ((this->size() != len) || (this->size() < offset + datalen))
    {
      if (!own_data)
      {
        fprintf(stderr,
                "Assigning referenced GPU vector, but it has the "
                "wrong size.\n");
        fprintf(stderr, "Name = %s. This size = %ld, vec size = %ld\n", name.c_str(), size(), len);
        abort();
      }
      if (len < offset + datalen)
      {
        fprintf(stderr, "Trying to write more than the length of the vector.\n");
        fprintf(stderr, "Name = %s. This size = %ld, vec size = %ld, needed length = %ld\n", name.c_str(), size(), len,
                offset + datalen);
        abort();
      }
      resize(len);
    }
#ifdef QMC_CUDA
    cudaMemcpyAsync(&((*this)[offset]), vec_ptr, datalen * sizeof(T), cudaMemcpyHostToDevice, kernelStream);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
      fprintf(stderr, "In asyncCopy, name=%s, size=%ld vec.size()=%ld\n", name.c_str(), size(), len);
      fprintf(stderr, "this pointer = %p vec pointer=%p\n", data_pointer, vec_ptr);
      fprintf(stderr, "CUDA error in device_vector::asyncCopy(const T* vec_ptr, len, offset, datalen) for %s:\n %s\n",
              name.c_str(), cudaGetErrorString(err));
      abort();
    }
#endif
  }
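
  // Example (illustrative): streaming one walker's block into a larger
  // concatenated buffer, where `host_block` is a hypothetical host array of
  // `blk` elements belonging to walker `iw`:
  //
  //   d_all.asyncCopy(host_block, nwalkers * blk, iw * blk, blk);
  //   cudaStreamSynchronize(kernelStream); // before kernels read d_all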

  void asyncCopy(const std::vector<T, std::allocator<T>>& vec)
  {
    if (this->size() != vec.size())
    {
      if (!own_data)
      {
        fprintf(stderr,
                "Assigning referenced GPU vector, but it has the "
                "wrong size.\n");
        fprintf(stderr, "Name = %s. This size = %ld, vec size = %ld\n", name.c_str(), size(), vec.size());
        abort();
      }
      resize(vec.size(), 1.0, managedmem);
    }
#ifdef QMC_CUDA
    cudaMemcpyAsync(&((*this)[0]), &(vec[0]), vec.size() * sizeof(T), cudaMemcpyHostToDevice, kernelStream);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
      fprintf(stderr, "In asyncCopy, name=%s, size=%ld vec.size()=%ld\n", name.c_str(), size(), vec.size());
      fprintf(stderr, "this pointer = %p vec pointer=%p\n", data_pointer, &(vec[0]));
      fprintf(stderr,
              "CUDA error in device_vector::asyncCopy(const std::vector<T, std::allocator<T> > &vec) for %s:\n %s\n",
              name.c_str(), cudaGetErrorString(err));
      abort();
    }
#endif
  }
  void copyFromGPU(std::vector<T, std::allocator<T>>& vec)
  {
    if (this->size() != vec.size())
    {
      vec.resize(size());
    }
#ifdef QMC_CUDA
    cudaMemcpy(&(vec[0]), &((*this)[0]), vec.size() * sizeof(T), cudaMemcpyDeviceToHost);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
      fprintf(stderr, "In copyFromGPU, name=%s, size=%ld vec.size()=%ld\n", name.c_str(), size(), vec.size());
      fprintf(stderr, "this pointer = %p vec pointer=%p\n", data_pointer, &(vec[0]));
      fprintf(stderr,
              "CUDA error in device_vector::copyFromGPU(std::vector<T, std::allocator<T> > &vec) for %s:\n %s\n",
              name.c_str(), cudaGetErrorString(err));
      abort();
    }
#endif
  }


  inline T* data() const { return data_pointer; }
};
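
// Usage sketch for device_vector (illustrative; `my_kernel` is hypothetical):
//
//   std::vector<float> h(n, 1.0f);
//   device_vector<float> d("my_buffer");
//   d = h;                                          // host -> device (resizes as needed)
//   my_kernel<<<grid, block>>>(d.data(), d.size()); // kernels consume the raw pointer
//   d.copyFromGPU(h);                               // device -> host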


// Vector of pinned (page-locked) host memory, allocated with cudaHostAlloc
// so that asynchronous host<->device transfers are possible.
template<typename T>
class host_vector
{
private:
  T* data;
  size_t current_size;
  size_t capacity;

public:
  host_vector()
  {
    data         = NULL;
    current_size = 0;
    capacity     = 0;
  }

  host_vector(const host_vector<T>& vec)
  {
    if (vec.size() != 0)
    {
      cudaHostAlloc((void**)&data, vec.size() * sizeof(T), 0);
      cudaMemcpy(data, vec.data, vec.size() * sizeof(T), cudaMemcpyHostToHost);
    }
    else
    {
      data = NULL;
    }
    current_size = vec.size();
    capacity     = current_size;
  }

  host_vector(int size)
  {
    data         = NULL;
    current_size = 0;
    capacity     = 0;
    resize(size);
  }

  ~host_vector()
  {
    if (data)
    {
      cudaFreeHost(data);
      data         = NULL;
      current_size = 0;
      capacity     = 0;
    }
  }

  host_vector(const device_vector<T>& vec)
  {
    data         = NULL;
    current_size = 0;
    capacity     = 0;
    resize(vec.size());
#ifdef QMC_CUDA
    cudaMemcpy(&(data[0]), &(vec[0]), current_size * sizeof(T), cudaMemcpyDeviceToHost);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
      fprintf(stderr, "CUDA error in host_vector::copy constructor():\n %s\n", cudaGetErrorString(err));
      abort();
    }
#endif
  }


  host_vector& operator=(const host_vector<T>& vec)
  {
    if (this->size() != vec.size())
      this->resize(vec.size());
#ifdef QMC_CUDA
    // Both source and destination are pinned host buffers, so this is a
    // host-to-host copy.
    cudaMemcpyAsync(&((*this)[0]), &(vec[0]), this->size() * sizeof(T), cudaMemcpyHostToHost);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
      fprintf(stderr, "CUDA error in host_vector::operator=(host_vector):\n %s\n", cudaGetErrorString(err));
      abort();
    }
#endif
    return *this;
  }

  host_vector& operator=(const device_vector<T>& vec)
  {
    if (this->size() != vec.size())
      this->resize(vec.size());
#ifdef QMC_CUDA
    cudaMemcpy(&((*this)[0]), &(vec[0]), this->size() * sizeof(T), cudaMemcpyDeviceToHost);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
      fprintf(stderr, "CUDA error in host_vector::operator=(device_vector %p):\n %s\n", &(vec[0]),
              cudaGetErrorString(err));
      abort();
    }
#endif
    return *this;
  }

  void asyncCopy(const device_vector<T>& vec)
  {
    if (this->size() != vec.size())
      resize(vec.size());
#ifdef QMC_CUDA
    cudaMemcpyAsync(&((*this)[0]), &(vec[0]), this->size() * sizeof(T), cudaMemcpyDeviceToHost, memoryStream);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
      fprintf(stderr, "CUDA error in host_vector::asyncCopy:\n %s\n", cudaGetErrorString(err));
      abort();
    }
#endif
  }

  inline size_t size() const { return current_size; }

  void reserve(size_t new_size)
  {
    if (new_size <= capacity)
      return;
    T* new_data;
    // QMCPACK often does repeated resizes like 256->257 and then 257->258.
    // This anticipates further growth by pre-allocating an additional
    // 5% above what was requested.
    new_size = (size_t)(1.05 * new_size);
    cudaHostAlloc((void**)&new_data, new_size * sizeof(T), 0);
    if (data != NULL)
    {
      cudaMemcpy(new_data, data, current_size * sizeof(T), cudaMemcpyHostToHost);
      cudaFreeHost(data);
      data = NULL;
    }
    data     = new_data;
    capacity = new_size;
  }

  T& operator[](const int n) { return data[n]; }
  const T& operator[](const int n) const { return data[n]; }

  inline void resize(size_t new_size)
  {
    if (new_size <= current_size)
    {
      current_size = new_size;
      if (new_size == 0)
      {
        clear();
      }
      return;
    }
    reserve(new_size);
    for (size_t i = current_size; i < new_size; ++i)
      data[i] = T();
    current_size = new_size;
  }

  inline void clear()
  {
    // The allocation is kept (only the logical size is reset) so the pinned
    // buffer can be reused without another cudaHostAlloc.
    if (data != NULL)
    {
      //cudaFreeHost(data);
      //data = NULL;
      current_size = 0;
      //capacity = 0;
    }
  }

  inline void push_back(const T& x)
  {
    if (current_size < capacity)
    {
      data[current_size] = x;
      ++current_size;
      return;
    }
    reserve(2 * capacity + 1);
    push_back(x);
  }
};
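
// Example (illustrative): using host_vector as a pinned staging buffer so a
// device-to-host transfer can overlap with CPU work (memoryStream comes from
// gpu_misc.h):
//
//   host_vector<double> h_stage(n);
//   device_vector<double> d_buf("d_buf", n);
//   h_stage.asyncCopy(d_buf);             // async D->H on memoryStream
//   /* ...unrelated CPU work... */
//   cudaStreamSynchronize(memoryStream);  // h_stage is now valid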

template<typename T>
device_vector<T>::device_vector(const host_vector<T>& vec)
    : data_pointer(NULL),
      current_size(0),
      alloc_size(0),
      own_data(true),
      managedmem(false),
      managed_flags(cudaMemAttachGlobal)
{
  this->resize(vec.size());
#ifdef QMC_CUDA
  // Copying from a pinned host vector into freshly allocated device memory,
  // so the copy kind is host-to-device.
  cudaMemcpy(&((*this)[0]), &(vec[0]), this->size() * sizeof(T), cudaMemcpyHostToDevice);
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    fprintf(stderr, "CUDA error in device_vector::device_vector(const host_vector<T>&) for %s:\n %s\n", name.c_str(),
            cudaGetErrorString(err));
    abort();
  }
#endif
}

// Convenience pairing of a pinned host buffer with a device buffer of the
// same size, with explicit transfers between the two.
template<typename T>
class device_host_vector
{
public:
  host_vector<T> CPU;
  device_vector<T> GPU;

  device_host_vector() {}

  device_host_vector(size_t size) : CPU(size), GPU(size) {}

  inline void resize(size_t size)
  {
    CPU.resize(size);
    GPU.resize(size);
  }

  inline void host_to_device() { GPU = CPU; }

  inline void device_to_host() { CPU = GPU; }

  inline T operator[](size_t i) const { return CPU[i]; }

  inline T& operator[](size_t i) { return CPU[i]; }

  inline void fill(T val)
  {
    // host_vector provides no iterators, so fill through the raw buffer.
    std::fill_n(&CPU[0], CPU.size(), val);
    host_to_device();
  }

  inline T* gpu_data() { return GPU.data(); }
};
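
// Usage sketch for device_host_vector (illustrative; `some_kernel` is hypothetical):
//
//   device_host_vector<int> flags(nw);
//   flags.fill(0);                                // set on host, mirror to device
//   flags[3] = 1;                                 // edit the host copy...
//   flags.host_to_device();                       // ...then push it to the GPU
//   some_kernel<<<g, b>>>(flags.gpu_data(), nw);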
} // namespace gpu

#endif