//////////////////////////////////////////////////////////////////////////////////////
// This file is distributed under the University of Illinois/NCSA Open Source License.
// See LICENSE file in top directory for details.
//
// Copyright (c) 2016 Jeongnim Kim and QMCPACK developers.
//
// File developed by: Ken Esler, kpesler@gmail.com, University of Illinois at Urbana-Champaign
//                    Jeremy McMinnis, jmcminis@gmail.com, University of Illinois at Urbana-Champaign
//                    Jeongnim Kim, jeongnim.kim@gmail.com, University of Illinois at Urbana-Champaign
//                    Ye Luo, yeluo@anl.gov, Argonne National Laboratory
//
// File created by: Ken Esler, kpesler@gmail.com, University of Illinois at Urbana-Champaign
//////////////////////////////////////////////////////////////////////////////////////


#ifndef GPU_VECTOR_H
#define GPU_VECTOR_H

#include <malloc.h>
#include <iostream>
#include <string>
#include <map>
#include <vector>
#include <cmath>
#include <cstdlib>
#include <cstdio>
#include <algorithm>

#ifdef QMC_CUDA
#include <cuda_runtime_api.h>
#include "gpu_misc.h"
#endif
namespace gpu
{
struct gpu_mem_object
{
  size_t num_objects;
  size_t total_bytes;
  gpu_mem_object(size_t size) : num_objects(1), total_bytes(size) {}
  gpu_mem_object() : num_objects(0), total_bytes(0) {}
};


class cuda_memory_manager_type
{
private:
  std::map<std::string, gpu_mem_object> gpu_mem_map;
  std::map<void*, std::pair<std::string, size_t>> gpu_pointer_map;

public:
  void* allocate(size_t bytes, std::string name = "");
  void* allocate_managed(size_t bytes, std::string name = "", unsigned int flags = cudaMemAttachGlobal);

  void deallocate(void* p);

  void report();
};

extern cuda_memory_manager_type cuda_memory_manager;
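
// Usage sketch (illustrative only, not a usage contract): the manager tracks
// allocations by name so report() can attribute GPU memory per category.
//
//   void* buf = gpu::cuda_memory_manager.allocate(1024 * sizeof(double), "scratch");
//   // ... use buf in kernels ...
//   gpu::cuda_memory_manager.report();      // prints per-name object/byte counts
//   gpu::cuda_memory_manager.deallocate(buf);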

template<typename T>
class host_vector;

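// device_vector<T>: a std::vector-like wrapper around GPU (device) memory,
// allocated through cuda_memory_manager. It can own its allocation or merely
// reference memory owned elsewhere (see reference() below). Note that
// operator[] returns a reference into device memory; on the host it is
// typically only useful for taking addresses (e.g., &vec[0]) unless managed
// memory is in use.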
template<typename T>
class device_vector
{
private:
  T* data_pointer;
  size_t current_size, alloc_size;
  std::string name;
  // True if the data was allocated by this vector.  False if we're
  // referencing memory owned elsewhere.
  bool own_data;
  // True if managed memory was requested via the resize function; starts out false.
  bool managedmem;
  // Flags for managed memory creation (default cudaMemAttachGlobal); set via set_managed_flags.
  unsigned int managed_flags;

public:
  typedef T* pointer;

  void set_name(std::string n) { name = n; }

  void set_managed_flags(unsigned int flags) { managed_flags = flags; }

  inline device_vector()
      : data_pointer(NULL),
        current_size(0),
        alloc_size(0),
        own_data(true),
        managedmem(false),
        managed_flags(cudaMemAttachGlobal)
  {}

  inline device_vector(std::string myName)
      : data_pointer(NULL),
        current_size(0),
        alloc_size(0),
        name(myName),
        own_data(true),
        managedmem(false),
        managed_flags(cudaMemAttachGlobal)
  {}

  inline device_vector(size_t size)
      : data_pointer(NULL),
        current_size(0),
        alloc_size(0),
        own_data(true),
        managedmem(false),
        managed_flags(cudaMemAttachGlobal)
  {
    resize(size);
  }

  inline device_vector(std::string myName, size_t size)
      : data_pointer(NULL),
        current_size(0),
        alloc_size(0),
        name(myName),
        own_data(true),
        managedmem(false),
        managed_flags(cudaMemAttachGlobal)
  {
    resize(size);
  }

  inline device_vector(const host_vector<T>& vec);

  ~device_vector()
  {
    if (alloc_size > 0 && data_pointer && own_data)
      cuda_memory_manager.deallocate(data_pointer);
  }

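  // Adopt an externally owned buffer without copying: any memory this vector
  // owns is released first, and the vector will not deallocate p on
  // destruction (own_data becomes false). The caller retains ownership of p.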
  inline void reference(T* p, size_t size)
  {
    //       fprintf (stderr, "reference called for name=%s size=%ld ptr=%p\n",
    // 	       name.c_str(), size, p);
    if (own_data && alloc_size > 0)
      cuda_memory_manager.deallocate(data_pointer);
    data_pointer = p;
    current_size = size;
    alloc_size   = 0;
    own_data     = false;
    managedmem   = false;
  }


  inline T& operator[](size_t i) const { return data_pointer[i]; }

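  // Resize semantics: shrinking only adjusts current_size (the allocation is
  // kept); growing reallocates through cuda_memory_manager, over-allocating
  // by reserve_factor to absorb future growth. Passing managed = true
  // requests CUDA managed (unified) memory for the new allocation. Growth
  // does not preserve existing contents.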
  inline void resize(size_t size, double reserve_factor = 1.0, bool managed = false)
  {
    if (!size)
    {
      current_size = 0;
      return;
    }
    size_t reserve_size = (size_t)std::ceil(reserve_factor * size);
    size_t byte_size    = sizeof(T) * reserve_size;
    bool error          = false;
    if (managed != managedmem)
    {
      if (managedmem)
      {
        if (alloc_size > 0) // Only trigger error message if memory is allocated
        {
          fprintf(stderr, "device_vector.resize from managed (%p) ", data_pointer);
          error = true;
        }
      }
      else
      {
        if (alloc_size != 0)
          fprintf(stderr, "device_vector.resize from non-managed to managed.\n");
      }
    }
    if ((size > alloc_size) || (alloc_size == 0))
    {
      if (own_data && (alloc_size > 0))
        cuda_memory_manager.deallocate(data_pointer);
      if (managed)
        data_pointer = (T*)cuda_memory_manager.allocate_managed(byte_size, name, managed_flags);
      else
        data_pointer = (T*)cuda_memory_manager.allocate(byte_size, name);
      alloc_size = reserve_size;
      own_data   = true;
      managedmem = managed;
    }
    current_size = size;
    if (error)
      fprintf(stderr, "to non-managed (%p).\n", data_pointer);
  }

  inline void clear()
  {
    if (alloc_size)
    {
      cuda_memory_manager.deallocate(data_pointer);
      data_pointer = NULL;
      current_size = alloc_size = 0;
    }
  }

  inline size_t size() const { return current_size; }


  inline device_vector& operator=(const device_vector<T>& vec)
  {
    if (this->size() != vec.size())
    {
      if (!own_data)
      {
        fprintf(stderr,
                "Assigning referenced GPU vector, but it has the "
                "wrong size.\n");
        fprintf(stderr, "Name = %s.  This size = %ld, vec size = %ld\n", name.c_str(), size(), vec.size());
        abort();
      }
      resize(vec.size(), 1.0, managedmem);
    }
#ifdef QMC_CUDA
    cudaMemcpyAsync(data_pointer, &(vec[0]), this->size() * sizeof(T), cudaMemcpyDeviceToDevice);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
      fprintf(stderr, "CUDA error in device_vector::operator=(device_vector):\n  %s\n", cudaGetErrorString(err));
      fprintf(stderr, "vec.size() = %ld\n", vec.size());
      abort();
    }
#endif
    return *this;
  }

  device_vector(const device_vector<T>& vec)
      : data_pointer(NULL),
        current_size(0),
        alloc_size(0),
        name(vec.name),
        own_data(true),
        managedmem(vec.managedmem),
        managed_flags(vec.managed_flags)
  {
    resize(vec.size(), 1.0, managedmem);
    // fprintf(stderr, "device_vector copy constructor called, name=%s.\n",
    // 	      name.c_str());
#ifdef QMC_CUDA
    if (this->size() != 0)
    {
      cudaMemcpy(data_pointer, &(vec[0]), vec.size() * sizeof(T), cudaMemcpyDeviceToDevice);
      cudaError_t err = cudaGetLastError();
      if (err != cudaSuccess)
      {
        fprintf(stderr, "CUDA error in device_vector::copy constructor:\n  %s\n", cudaGetErrorString(err));
        abort();
      }
    }
#endif
  }

  device_vector& operator=(const std::vector<T, std::allocator<T>>& vec)
  {
    if (this->size() != vec.size())
    {
      if (!own_data)
      {
        fprintf(stderr,
                "Assigning referenced GPU vector, but it has the "
                "wrong size.\n");
        // fprintf (stderr, "Name = %s.  This size = %ld, vec size = %ld\n",
        // 	   name.c_str(), size(), vec.size());
        abort();
      }
      resize(vec.size(), 1.0, managedmem);
    }
#ifdef QMC_CUDA
    cudaMemcpyAsync(data_pointer, &(vec[0]), this->size() * sizeof(T), cudaMemcpyHostToDevice);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
      fprintf(stderr, "CUDA error in device_vector::operator (%p)=(std::vector %p):\n  %s\n", data_pointer, &(vec[0]),
              cudaGetErrorString(err));
      abort();
    }
#endif
    return *this;
  }

  device_vector& operator=(const host_vector<T>& vec)
  {
    if (this->size() != vec.size())
    {
      if (!own_data)
      {
        fprintf(stderr,
                "Assigning referenced GPU vector, but it has the "
                "wrong size.\n");
        fprintf(stderr, "Name = %s.  This size = %ld, vec size = %ld\n", name.c_str(), size(), vec.size());
        abort();
      }
      resize(vec.size(), 1.0, managedmem);
    }
#ifdef QMC_CUDA
    cudaMemcpy(&((*this)[0]), &(vec[0]), vec.size() * sizeof(T), cudaMemcpyHostToDevice);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
      fprintf(stderr, "In operator=, name=%s, size=%ld  vec.size()=%ld\n", name.c_str(), size(), vec.size());
      fprintf(stderr, "this pointer = %p  vec pointer=%p\n", data_pointer, &(vec[0]));
      fprintf(stderr, "CUDA error in device_vector::operator=(const host_vector<T> &vec) for %s:\n  %s\n", name.c_str(),
              cudaGetErrorString(err));
      abort();
    }
#endif
    return *this;
  }

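  // The asyncCopy overloads enqueue host-to-device copies on kernelStream
  // (from gpu_misc.h) and return immediately. Copies are ordered with respect
  // to later work on the same stream; synchronize the stream before touching
  // the data from other streams or from the host.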
  void asyncCopy(const host_vector<T>& vec)
  {
    if (this->size() != vec.size())
    {
      if (!own_data)
      {
        fprintf(stderr,
                "Assigning referenced GPU vector, but it has the "
                "wrong size.\n");
        fprintf(stderr, "Name = %s.  This size = %ld, vec size = %ld\n", name.c_str(), size(), vec.size());
        abort();
      }
      resize(vec.size(), 1.0, managedmem);
    }
#ifdef QMC_CUDA
    cudaMemcpyAsync(&((*this)[0]), &(vec[0]), vec.size() * sizeof(T), cudaMemcpyHostToDevice, kernelStream);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
      fprintf(stderr, "In asyncCopy, name=%s, size=%ld  vec.size()=%ld\n", name.c_str(), size(), vec.size());
      fprintf(stderr, "this pointer = %p  vec pointer=%p\n", data_pointer, &(vec[0]));
      fprintf(stderr, "CUDA error in device_vector::asyncCopy(const host_vector<T> &vec) for %s:\n  %s\n", name.c_str(),
              cudaGetErrorString(err));
      abort();
    }
#endif
  }

  void asyncCopy(const T* vec_ptr, size_t len, size_t offset, size_t datalen)
  {
    if ((this->size() != len) || (this->size() < offset + datalen))
    {
      if (!own_data)
      {
        fprintf(stderr,
                "Assigning referenced GPU vector, but it has the "
                "wrong size.\n");
        fprintf(stderr, "Name = %s.  This size = %ld, vec size = %ld\n", name.c_str(), size(), len);
        abort();
      }
      if (len < offset + datalen)
      {
        fprintf(stderr, "Trying to write more than the length of the vector.\n");
        fprintf(stderr, "Name = %s.  This size = %ld, vec size = %ld, needed length = %ld\n", name.c_str(), size(), len,
                offset + datalen);
        abort();
      }
      resize(len);
    }
#ifdef QMC_CUDA
    cudaMemcpyAsync(&((*this)[offset]), vec_ptr, datalen * sizeof(T), cudaMemcpyHostToDevice, kernelStream);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
      fprintf(stderr, "In asyncCopy, name=%s, size=%ld  vec.size()=%ld\n", name.c_str(), size(), len);
      fprintf(stderr, "this pointer = %p  vec pointer=%p\n", data_pointer, vec_ptr);
      fprintf(stderr, "CUDA error in device_vector::asyncCopy(const T* vec_ptr, len, offset, datalen) for %s:\n  %s\n",
              name.c_str(), cudaGetErrorString(err));
      abort();
    }
#endif
  }

  void asyncCopy(const std::vector<T, std::allocator<T>>& vec)
  {
    if (this->size() != vec.size())
    {
      if (!own_data)
      {
        fprintf(stderr,
                "Assigning referenced GPU vector, but it has the "
                "wrong size.\n");
        fprintf(stderr, "Name = %s.  This size = %ld, vec size = %ld\n", name.c_str(), size(), vec.size());
        abort();
      }
      resize(vec.size(), 1.0, managedmem);
    }
#ifdef QMC_CUDA
    cudaMemcpyAsync(&((*this)[0]), &(vec[0]), vec.size() * sizeof(T), cudaMemcpyHostToDevice, kernelStream);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
      fprintf(stderr, "In asyncCopy, name=%s, size=%ld  vec.size()=%ld\n", name.c_str(), size(), vec.size());
      fprintf(stderr, "this pointer = %p  vec pointer=%p\n", data_pointer, &(vec[0]));
      fprintf(stderr,
              "CUDA error in device_vector::asyncCopy(const std::vector<T, std::allocator<T> > &vec) for %s:\n  %s\n",
              name.c_str(), cudaGetErrorString(err));
      abort();
    }
#endif
  }
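  // Blocking device-to-host copy; resizes the destination std::vector if needed.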
  void copyFromGPU(std::vector<T, std::allocator<T>>& vec)
  {
    if (this->size() != vec.size())
    {
      vec.resize(size());
    }
#ifdef QMC_CUDA
    cudaMemcpy(&(vec[0]), &((*this)[0]), vec.size() * sizeof(T), cudaMemcpyDeviceToHost);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
      fprintf(stderr, "In copyFromGPU, name=%s, size=%ld  vec.size()=%ld\n", name.c_str(), size(), vec.size());
      fprintf(stderr, "this pointer = %p  vec pointer=%p\n", data_pointer, &(vec[0]));
      fprintf(stderr,
              "CUDA error in device_vector::copyFromGPU(std::vector<T, std::allocator<T> > &vec) for %s:\n  %s\n",
              name.c_str(), cudaGetErrorString(err));
      abort();
    }
#endif
  }


  inline T* data() const { return data_pointer; }
};

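// Usage sketch for device_vector / host_vector round trips (illustrative;
// the names h_buf, d_buf, and n are hypothetical):
//
//   gpu::host_vector<float> h_buf(n);   // pinned host memory
//   for (size_t i = 0; i < n; ++i)
//     h_buf[i] = 0.0f;
//   gpu::device_vector<float> d_buf("buf");
//   d_buf = h_buf;                      // blocking host-to-device copy
//   // ... launch kernels on d_buf.data() ...
//   h_buf = d_buf;                      // blocking device-to-host copy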
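// host_vector<T>: a minimal std::vector-like container backed by pinned
// (page-locked) host memory from cudaHostAlloc, so transfers to and from
// device_vector can use the fast DMA path and asynchronous copies.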
template<typename T>
class host_vector
{
private:
  T* data;
  size_t current_size;
  size_t capacity;

public:
  host_vector()
  {
    data         = NULL;
    current_size = 0;
    capacity     = 0;
  }

  host_vector(const host_vector<T>& vec)
  {
    if (vec.size() != 0)
    {
      cudaHostAlloc((void**)&data, vec.size() * sizeof(T), 0);
      cudaMemcpy(data, vec.data, vec.size() * sizeof(T), cudaMemcpyHostToHost);
    }
    else
    {
      data = NULL;
    }
    current_size = vec.size();
    capacity     = current_size;
  }

  host_vector(int size)
  {
    data         = NULL;
    current_size = 0;
    capacity     = 0;
    resize(size);
  }

  ~host_vector()
  {
    if (data)
    {
      cudaFreeHost(data);
      data         = NULL;
      current_size = 0;
      capacity     = 0;
    }
  }

  host_vector(const device_vector<T>& vec)
  {
    data         = NULL;
    current_size = 0;
    capacity     = 0;
    resize(vec.size());
#ifdef QMC_CUDA
    cudaMemcpy(&(data[0]), &(vec[0]), current_size * sizeof(T), cudaMemcpyDeviceToHost);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
      fprintf(stderr, "CUDA error in host_vector::copy constructor():\n  %s\n", cudaGetErrorString(err));
      abort();
    }
#endif
  }

  host_vector& operator=(const host_vector<T>& vec)
  {
    if (this->size() != vec.size())
      this->resize(vec.size());
#ifdef QMC_CUDA
    // Both buffers live in pinned host memory, so this is a host-to-host copy.
    cudaMemcpyAsync(&((*this)[0]), &(vec[0]), this->size() * sizeof(T), cudaMemcpyHostToHost);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
      fprintf(stderr, "CUDA error in host_vector::operator=(host_vector):\n  %s\n", cudaGetErrorString(err));
      abort();
    }
#endif
    return *this;
  }

  host_vector& operator=(const device_vector<T>& vec)
  {
    if (this->size() != vec.size())
      this->resize(vec.size());
#ifdef QMC_CUDA
    cudaMemcpy(&((*this)[0]), &(vec[0]), this->size() * sizeof(T), cudaMemcpyDeviceToHost);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
      fprintf(stderr, "CUDA error in host_vector::operator=(device_vector %p):\n  %s\n", &(vec[0]),
              cudaGetErrorString(err));
      abort();
    }
#endif
    return *this;
  }

  void asyncCopy(const device_vector<T>& vec)
  {
    if (this->size() != vec.size())
      resize(vec.size());
#ifdef QMC_CUDA
    cudaMemcpyAsync(&((*this)[0]), &(vec[0]), this->size() * sizeof(T), cudaMemcpyDeviceToHost, memoryStream);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
      fprintf(stderr, "CUDA error in host_vector::asyncCopy:\n  %s\n", cudaGetErrorString(err));
      abort();
    }
#endif
  }

  inline size_t size() const { return current_size; }

  void reserve(size_t new_size)
  {
    if (new_size <= capacity)
      return;
    T* new_data;
    // QMCPACK often does repeated resizes like 256->257 then 257->258.
    // This anticipates further resizes by pre-allocating an additional
    // 5% above what was requested.
    new_size = 1.05 * new_size;
    cudaHostAlloc((void**)&new_data, new_size * sizeof(T), 0);
    if (data != NULL)
    {
      cudaMemcpy(new_data, data, current_size * sizeof(T), cudaMemcpyHostToHost);
      cudaFreeHost(data);
      data = NULL;
    }
    data     = new_data;
    capacity = new_size;
  }

  T& operator[](const int n) { return data[n]; }
  const T& operator[](const int n) const { return data[n]; }

  // Iterator-style accessors; device_host_vector::fill below relies on these.
  T* begin() { return data; }
  T* end() { return data + current_size; }

  inline void resize(size_t new_size)
  {
    if (new_size <= current_size)
    {
      current_size = new_size;
      if (new_size == 0)
      {
        clear();
      }
      return;
    }
    reserve(new_size);
    for (size_t i = current_size; i < new_size; ++i)
      data[i] = T();
    current_size = new_size;
  }

  inline void clear()
  {
    if (data != NULL)
    {
      // Keep the pinned allocation for reuse; only reset the logical size.
      //cudaFreeHost(data);
      //data = NULL;
      current_size = 0;
      //capacity = 0;
    }
  }

  inline void push_back(const T& x)
  {
    if (current_size < capacity)
    {
      data[current_size] = x;
      ++current_size;
      return;
    }
    reserve(2 * capacity + 1);
    push_back(x);
  }
};

template<typename T>
device_vector<T>::device_vector(const host_vector<T>& vec)
    : data_pointer(NULL),
      current_size(0),
      alloc_size(0),
      own_data(true),
      managedmem(false),
      managed_flags(cudaMemAttachGlobal)
{
  this->resize(vec.size());
#ifdef QMC_CUDA
  // Copy from pinned host memory into the fresh device allocation.
  cudaMemcpy(&((*this)[0]), &(vec[0]), this->size() * sizeof(T), cudaMemcpyHostToDevice);
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    fprintf(stderr, "CUDA error in device_vector::device_vector(const host_vector<T>&) for %s:\n  %s\n", name.c_str(),
            cudaGetErrorString(err));
    abort();
  }
#endif
}

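// device_host_vector<T>: a mirrored CPU/GPU pair kept in sync explicitly via
// host_to_device() / device_to_host(). operator[] reads and writes the CPU
// copy only; the caller decides when to transfer.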
template<typename T>
class device_host_vector
{
public:
  host_vector<T> CPU;
  device_vector<T> GPU;

  device_host_vector() {}

  device_host_vector(size_t size) : CPU(size), GPU(size) {}

  inline void resize(size_t size)
  {
    CPU.resize(size);
    GPU.resize(size);
  }

  inline void host_to_device() { GPU = CPU; }

  inline void device_to_host() { CPU = GPU; }

  inline T operator[](size_t i) const { return CPU[i]; }

  inline T& operator[](size_t i) { return CPU[i]; }

  inline void fill(T val)
  {
    std::fill(CPU.begin(), CPU.end(), val);
    host_to_device();
  }

  inline T* gpu_data() { return GPU.data(); }
};
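
// Usage sketch (illustrative only): stage values on the CPU side, then push
// them to the GPU in one transfer. num_walkers and launch_kernel are
// hypothetical names.
//
//   gpu::device_host_vector<int> flags(num_walkers);
//   flags.fill(0);                     // sets the CPU copy and syncs to GPU
//   flags[3] = 1;                      // modifies the CPU copy only
//   flags.host_to_device();            // make the GPU copy current
//   launch_kernel(flags.gpu_data());   // consume the device pointer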
} // namespace gpu

#endif