// Copyright (C) 2015  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CuDNN_CPP_
#define DLIB_DNN_CuDNN_CPP_

#ifdef DLIB_USE_CUDA

#include "cudnn_dlibapi.h"
#include "tensor.h"
#include <cudnn.h>
#include <tuple>
#include <map>
#include <iostream>
#include <string>
#include <vector>
#include "cuda_utils.h"
#include "cpu_dlib.h"
#include "cuda_dlib.h"
#include "tensor_tools.h"

static const char* cudnn_get_error_string(cudnnStatus_t s)
{
    switch(s)
    {
        case CUDNN_STATUS_NOT_INITIALIZED:
            return "CUDA Runtime API initialization failed.";
        case CUDNN_STATUS_ALLOC_FAILED:
            return "CUDA Resources could not be allocated.";
        case CUDNN_STATUS_BAD_PARAM:
            return "CUDNN_STATUS_BAD_PARAM";
        case CUDNN_STATUS_EXECUTION_FAILED:
            return "CUDNN_STATUS_EXECUTION_FAILED";
        case CUDNN_STATUS_NOT_SUPPORTED:
            return "CUDNN_STATUS_NOT_SUPPORTED";
        case CUDNN_STATUS_ARCH_MISMATCH:
            return "CUDNN_STATUS_ARCH_MISMATCH: Your GPU is too old and not supported by cuDNN";
        default:
            return "A call to cuDNN failed";
    }
}

// Check the return value of a call to the cuDNN runtime for an error condition.
#define CHECK_CUDNN(call)                                                      \
do{                                                                              \
    const cudnnStatus_t error = call;                                         \
    if (error != CUDNN_STATUS_SUCCESS)                                        \
    {                                                                          \
        std::ostringstream sout;                                               \
        sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". ";\
        sout << "code: " << error << ", reason: " << cudnn_get_error_string(error);\
        throw dlib::cudnn_error(sout.str());                            \
    }                                                                          \
}while(false)
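// Usage note (illustrative): cuDNN calls below are wrapped in this macro, e.g.
// CHECK_CUDNN(cudnnCreate(&handle)).  Any status other than CUDNN_STATUS_SUCCESS is
// turned into a thrown dlib::cudnn_error that records the call text, the file and
// line, and the decoded status message.  Destructors call cuDNN directly, without
// the macro, so they never throw.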


namespace dlib
{

    namespace cuda
    {

    // ------------------------------------------------------------------------------------

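        // These small helpers just unwrap the opaque handle stored in a tensor or
        // tensor_descriptor into the cudnnTensorDescriptor_t type the cuDNN API expects.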
        static cudnnTensorDescriptor_t descriptor(const tensor& t)
        {
            return (const cudnnTensorDescriptor_t)t.get_cudnn_tensor_descriptor().get_handle();
        }
        static cudnnTensorDescriptor_t descriptor(const tensor_descriptor& t)
        {
            return (const cudnnTensorDescriptor_t)t.get_handle();
        }

    // ------------------------------------------------------------------------------------

        class cudnn_context
        {
        public:
            // not copyable
            cudnn_context(const cudnn_context&) = delete;
            cudnn_context& operator=(const cudnn_context&) = delete;

            cudnn_context()
            {
                handles.resize(16);
            }
            ~cudnn_context()
            {
                for (auto h : handles)
                {
                    if (h)
                        cudnnDestroy(h);
                }
            }

            cudnnHandle_t get_handle (
            )
            {
                int new_device_id;
                CHECK_CUDA(cudaGetDevice(&new_device_id));
                // make room for more devices if needed
                if (new_device_id >= (long)handles.size())
                    handles.resize(new_device_id+16);

                // If we don't have a handle already for this device then make one
                if (!handles[new_device_id])
                    CHECK_CUDNN(cudnnCreate(&handles[new_device_id]));

                // Finally, return the handle for the current device
                return handles[new_device_id];
            }

        private:

            std::vector<cudnnHandle_t> handles;
        };

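        // context() returns a cuDNN handle bound to the CUDA device that is current on
        // the calling thread.  The cudnn_context object is thread_local, so each thread
        // lazily creates and caches its own per-device handles and no handle is ever
        // shared between threads.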
        static cudnnHandle_t context()
        {
            thread_local cudnn_context c;
            return c.get_handle();
        }
    // ------------------------------------------------------------------------------------

        class cudnn_activation_descriptor
        {
        public:
            // not copyable
            cudnn_activation_descriptor(const cudnn_activation_descriptor&) = delete;
            cudnn_activation_descriptor& operator=(const cudnn_activation_descriptor&) = delete;

            cudnn_activation_descriptor(
                cudnnActivationMode_t mode,
                cudnnNanPropagation_t reluNanOpt,
                double reluCeiling
            )
            {
                CHECK_CUDNN(cudnnCreateActivationDescriptor(&handle));
                CHECK_CUDNN(cudnnSetActivationDescriptor(handle, mode, reluNanOpt, reluCeiling));
            }

            ~cudnn_activation_descriptor()
            {
                cudnnDestroyActivationDescriptor(handle);
            }

            cudnnActivationDescriptor_t get_handle (
            )
            {
                return handle;
            }
        private:
            cudnnActivationDescriptor_t handle;
        };

        static cudnnActivationDescriptor_t relu_activation_descriptor()
        {
            thread_local cudnn_activation_descriptor des(CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN,0);
            return des.get_handle();
        }

        static cudnnActivationDescriptor_t sigmoid_activation_descriptor()
        {
            thread_local cudnn_activation_descriptor des(CUDNN_ACTIVATION_SIGMOID, CUDNN_PROPAGATE_NAN,0);
            return des.get_handle();
        }

        static cudnnActivationDescriptor_t tanh_activation_descriptor()
        {
            thread_local cudnn_activation_descriptor des(CUDNN_ACTIVATION_TANH, CUDNN_PROPAGATE_NAN,0);
            return des.get_handle();
        }

    // ------------------------------------------------------------------------------------

        tensor_descriptor::
        tensor_descriptor(
        ) : handle(nullptr)
        {
        }

        tensor_descriptor::
        ~tensor_descriptor()
        {
            set_size(0,0,0,0);
        }

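        // set_size() always destroys any existing cuDNN descriptor before (re)creating
        // one, and only creates a new descriptor when all dimensions are non-zero.  The
        // destructor above relies on this by calling set_size(0,0,0,0) to release the
        // handle.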
        void tensor_descriptor::
        set_size(
            int n,
            int k,
            int nr,
            int nc
        )
        {
            if (handle)
            {
                cudnnDestroyTensorDescriptor((cudnnTensorDescriptor_t)handle);
                handle = nullptr;
            }

            if (n != 0 && nr != 0 && nc != 0 && k != 0)
            {
                cudnnTensorDescriptor_t h;
                CHECK_CUDNN(cudnnCreateTensorDescriptor(&h));
                handle = h;

                CHECK_CUDNN(cudnnSetTensor4dDescriptor((cudnnTensorDescriptor_t)handle,
                        CUDNN_TENSOR_NCHW,
                        CUDNN_DATA_FLOAT,
                        n,
                        k,
                        nr,
                        nc));
            }
        }

        void tensor_descriptor::
        get_size (
            int& n,
            int& k,
            int& nr,
            int& nc
        ) const
        {
            if (handle)
            {
                int nStride, cStride, hStride, wStride;
                cudnnDataType_t datatype;
                CHECK_CUDNN(cudnnGetTensor4dDescriptor((cudnnTensorDescriptor_t)handle,
                        &datatype,
                        &n,
                        &k,
                        &nr,
                        &nc,
                        &nStride,
                        &cStride,
                        &hStride,
                        &wStride));
            }
            else
            {
                n = 0;
                k = 0;
                nr = 0;
                nc = 0;
            }
        }

    // ------------------------------------------------------------------------------------

        void add(
            float beta,
            tensor& dest,
            float alpha,
            const tensor& src
        )
        {
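            // This computes dest = beta*dest + alpha*src.  When src and dest have the
            // same dimensions the addition is elementwise; otherwise src must have one of
            // the broadcastable shapes allowed by the assert below (e.g. a single sample
            // holding per-channel or per-pixel values), and it is added to every matching
            // slice of dest.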
            DLIB_CASSERT(
                  (have_same_dimensions(src, dest) ||
                  (src.num_samples()==1 && src.k()==dest.k() && src.nr()==1 && src.nc()==1) ||
                  (src.num_samples()==1 && src.k()==dest.k() && src.nr()==dest.nr() && src.nc()==dest.nc()) ||
                  (src.num_samples()==1 && src.k()==1 && src.nr()==dest.nr() && src.nc()==dest.nc()) ||
                  (src.num_samples()==dest.num_samples() && src.k()==1 && src.nr()==1 && src.nc()==1)) &&
                  is_same_object(src,dest) == false ,
                    "\n\t dest.num_samples(): " << dest.num_samples()
                    <<"\n\t dest.k():           " << dest.k()
                    <<"\n\t dest.nr():          " << dest.nr()
                    <<"\n\t dest.nc():          " << dest.nc()
                    <<"\n\t src.num_samples():  " << src.num_samples()
                    <<"\n\t src.k():            " << src.k()
                    <<"\n\t src.nr():           " << src.nr()
                    <<"\n\t src.nc():           " << src.nc()
                    );

            if (dest.size() == src.size() && beta == 1)
            {
                // Call the dlib function in this case since it's faster than the one that
                // comes with cuDNN (at least as of cuDNN v4).
                add_scaled(dest, alpha, src);
                return;
            }
            else if (src.num_samples()==dest.num_samples() && src.k()==1 && src.nr()==1 && src.nc()==1)
            {
                add_cv_to_all_columns(beta, dest, alpha, src);
                return;
            }

            CHECK_CUDNN(cudnnAddTensor(context(),
                                    &alpha,
                                    descriptor(src),
                                    src.device(),
                                    &beta,
                                    descriptor(dest),
                                    dest.device()));
        }

        void assign_conv_bias_gradient (
            tensor& grad,
            const tensor& gradient_input
        )
        {
            DLIB_CASSERT(
                  grad.num_samples() == 1 &&
                  grad.k()  >= 1 &&
                  grad.nr() == 1 &&
                  grad.nc() == 1 &&
                  gradient_input.k() == grad.k() &&
                  gradient_input.size() > 0 &&
                  is_same_object(grad,gradient_input) == false
                  );

            const float alpha = 1;
            const float beta = 0;
            CHECK_CUDNN(cudnnConvolutionBackwardBias(context(),
                                               &alpha,
                                               descriptor(gradient_input),
                                               gradient_input.device(),
                                               &beta,
                                               descriptor(grad),
                                               grad.device()));
        }

    // ------------------------------------------------------------------------------------

        void batch_normalize_inference (
            const double eps,
            resizable_tensor& dest,
            const tensor& src,
            const tensor& gamma,
            const tensor& beta,
            const tensor& running_means,
            const tensor& running_variances
        )
        {
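            // Inference-mode batch normalization using the fixed statistics supplied by
            // the caller, i.e. the standard transform, per element:
            //   dest = gamma*(src - running_means)/sqrt(running_variances + eps) + beta
            // CUDNN_BATCHNORM_PER_ACTIVATION means every (k,nr,nc) position has its own
            // mean, variance, gamma, and beta, which is why those tensors must match the
            // spatial size of src in the assert below.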
            DLIB_CASSERT(
                gamma.num_samples() == 1 &&
                gamma.nr() == src.nr() &&
                gamma.nc() == src.nc() &&
                gamma.k()  == src.k() &&
                have_same_dimensions(gamma, beta) &&
                have_same_dimensions(gamma, running_means) &&
                have_same_dimensions(gamma, running_variances) &&
                eps > 0,
                "\ngamma.num_samples(): " << gamma.num_samples() <<
                "\ngamma.k():  " << gamma.k() <<
                "\ngamma.nr(): " << gamma.nr() <<
                "\ngamma.nc(): " << gamma.nc() <<
                "\nbeta.num_samples(): " << beta.num_samples() <<
                "\nbeta.k():   " << beta.k() <<
                "\nbeta.nr():  " << beta.nr() <<
                "\nbeta.nc():  " << beta.nc() <<
                "\nrunning_means.num_samples(): " << running_means.num_samples() <<
                "\nrunning_means.k():   " << running_means.k() <<
                "\nrunning_means.nr():  " << running_means.nr() <<
                "\nrunning_means.nc():  " << running_means.nc() <<
                "\nrunning_variances.num_samples(): " << running_variances.num_samples() <<
                "\nrunning_variances.k():   " << running_variances.k() <<
                "\nrunning_variances.nr():  " << running_variances.nr() <<
                "\nrunning_variances.nc():  " << running_variances.nc() <<
                "\nsrc.k():   " << src.k() <<
                "\nsrc.nr():  " << src.nr() <<
                "\nsrc.nc():  " << src.nc() <<
                "\neps:  " << eps
            );
            const float in_scale = 1;
            const float out_scale = 0;

            dest.copy_size(src);

            CHECK_CUDNN(cudnnBatchNormalizationForwardInference(
                                context(),
                                CUDNN_BATCHNORM_PER_ACTIVATION,
                                &in_scale,
                                &out_scale,
                                descriptor(src),
                                src.device(),
                                descriptor(dest),
                                dest.device(),
                                descriptor(gamma),
                                gamma.device(),
                                beta.device(),
                                running_means.device(),
                                running_variances.device(),
                                eps));
        }

        void batch_normalize (
            const double eps,
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
            const double averaging_factor,
            resizable_tensor& running_means,
            resizable_tensor& running_variances,
            const tensor& src,
            const tensor& gamma,
            const tensor& beta
        )
        {
            DLIB_CASSERT(0 <= averaging_factor && averaging_factor <= 1, "averaging_factor: " << averaging_factor);
            DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_means,means));
            DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_variances,invstds));
            DLIB_CASSERT(
                src.num_samples() > 1 &&
                gamma.num_samples() == 1 &&
                beta.num_samples() == 1 &&
                gamma.nr() == beta.nr() && beta.nr() == src.nr() &&
                gamma.nc() == beta.nc() && beta.nc() == src.nc() &&
                gamma.k()  == beta.k()  && beta.k() == src.k() &&
                eps > 0,
                "\ngamma.num_samples(): " << gamma.num_samples() <<
                "\ngamma.k():  " << gamma.k() <<
                "\ngamma.nr(): " << gamma.nr() <<
                "\ngamma.nc(): " << gamma.nc() <<
                "\nbeta.num_samples(): " << beta.num_samples() <<
                "\nbeta.k():   " << beta.k() <<
                "\nbeta.nr():  " << beta.nr() <<
                "\nbeta.nc():  " << beta.nc() <<
                "\nsrc.k():   " << src.k() <<
                "\nsrc.nr():  " << src.nr() <<
                "\nsrc.nc():  " << src.nc() <<
                "\neps:  " << eps
            );

            const float in_scale = 1;
            const float out_scale = 0;

            dest.copy_size(src);
            means.set_size(1, src.k(), src.nr(), src.nc());
            invstds.copy_size(means);
            running_means.copy_size(means);
            running_variances.copy_size(means);
            // cuDNN requires that running_means and running_variances be initialized to
            // some valid float values even if the averaging factor would have ignored
            // them.
            if (averaging_factor == 1)
            {
                running_means = 0;
                running_variances = 1;
            }

            CHECK_CUDNN(cudnnBatchNormalizationForwardTraining(
                                context(),
                                CUDNN_BATCHNORM_PER_ACTIVATION,
                                &in_scale,
                                &out_scale,
                                descriptor(src),
                                src.device(),
                                descriptor(dest),
                                dest.device(),
                                descriptor(gamma),
                                gamma.device(),
                                beta.device(),
                                averaging_factor,
                                running_means.device(),
                                running_variances.device(),
                                eps,
                                means.device(),
                                invstds.device()));
        }

        void batch_normalize_gradient(
            const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
            const tensor& src,
            const tensor& gamma,
            tensor& src_grad,
            tensor& gamma_grad,
            tensor& beta_grad
        )
        {
            const long num = src.k()*src.nr()*src.nc();
            DLIB_CASSERT(src.num_samples() > 1);
            DLIB_CASSERT(num == (long)means.size());
            DLIB_CASSERT(num == (long)invstds.size());
            DLIB_CASSERT(num == (long)gamma.size());
            DLIB_CASSERT(num == (long)gamma_grad.size());
            DLIB_CASSERT(num == (long)beta_grad.size());
            DLIB_CASSERT(have_same_dimensions(gradient_input, src));
            DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad));
            DLIB_CASSERT(eps > 0);

            const float in_scale = 1;
            const float out_scale = 1;
            const float in_scale_params = 1;
            const float out_scale_params = 0;

            CHECK_CUDNN(cudnnBatchNormalizationBackward(
                                context(),
                                CUDNN_BATCHNORM_PER_ACTIVATION,
                                &in_scale,
                                &out_scale,
                                &in_scale_params,
                                &out_scale_params,
                                descriptor(src),
                                src.device(),
                                descriptor(gradient_input),
                                gradient_input.device(),
                                descriptor(src_grad),
                                src_grad.device(),
                                descriptor(gamma),
                                gamma.device(),
                                gamma_grad.device(),
                                beta_grad.device(),
                                eps,
                                means.device(),
                                invstds.device()));
        }

    // ------------------------------------------------------------------------------------

        void batch_normalize_conv_inference (
            const double eps,
            resizable_tensor& dest,
            const tensor& src,
            const tensor& gamma,
            const tensor& beta,
            const tensor& running_means,
            const tensor& running_variances
        )
        {
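            // Same inference-mode computation as batch_normalize_inference() above, but
            // in CUDNN_BATCHNORM_SPATIAL mode: statistics are shared across all nr x nc
            // positions within a channel, so gamma, beta, running_means, and
            // running_variances hold exactly one value per channel (hence the
            // gamma.nr()==1 && gamma.nc()==1 && gamma.k()==src.k() requirement below).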
            DLIB_CASSERT(
                gamma.num_samples() == 1 &&
                gamma.nr() == 1 &&
                gamma.nc() == 1 &&
                gamma.k()  == src.k() &&
                have_same_dimensions(gamma, beta) &&
                have_same_dimensions(gamma, running_means) &&
                have_same_dimensions(gamma, running_variances) &&
                eps > 0,
                "\ngamma.num_samples(): " << gamma.num_samples() <<
                "\ngamma.k():  " << gamma.k() <<
                "\ngamma.nr(): " << gamma.nr() <<
                "\ngamma.nc(): " << gamma.nc() <<
                "\nbeta.num_samples(): " << beta.num_samples() <<
                "\nbeta.k():   " << beta.k() <<
                "\nbeta.nr():  " << beta.nr() <<
                "\nbeta.nc():  " << beta.nc() <<
                "\nrunning_means.num_samples(): " << running_means.num_samples() <<
                "\nrunning_means.k():   " << running_means.k() <<
                "\nrunning_means.nr():  " << running_means.nr() <<
                "\nrunning_means.nc():  " << running_means.nc() <<
                "\nrunning_variances.num_samples(): " << running_variances.num_samples() <<
                "\nrunning_variances.k():   " << running_variances.k() <<
                "\nrunning_variances.nr():  " << running_variances.nr() <<
                "\nrunning_variances.nc():  " << running_variances.nc() <<
                "\nsrc.k():   " << src.k() <<
                "\nsrc.nr():  " << src.nr() <<
                "\nsrc.nc():  " << src.nc() <<
                "\neps:  " << eps
            );
            const float in_scale = 1;
            const float out_scale = 0;

            dest.copy_size(src);

            CHECK_CUDNN(cudnnBatchNormalizationForwardInference(
                                context(),
                                CUDNN_BATCHNORM_SPATIAL,
                                &in_scale,
                                &out_scale,
                                descriptor(src),
                                src.device(),
                                descriptor(dest),
                                dest.device(),
                                descriptor(gamma),
                                gamma.device(),
                                beta.device(),
                                running_means.device(),
                                running_variances.device(),
                                eps));
        }

        void batch_normalize_conv (
            const double eps,
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
            const double averaging_factor,
            resizable_tensor& running_means,
            resizable_tensor& running_variances,
            const tensor& src,
            const tensor& gamma,
            const tensor& beta
        )
        {
            DLIB_CASSERT(0 <= averaging_factor && averaging_factor <= 1, "averaging_factor: " << averaging_factor);
            DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_means,means));
            DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_variances,invstds));
            DLIB_CASSERT(
                src.num_samples() > 1 &&
                gamma.num_samples() == 1 &&
                beta.num_samples() == 1 &&
                gamma.nr() == 1 &&
                beta.nr() == 1 &&
                gamma.nc() == 1 &&
                beta.nc() == 1 &&
                gamma.k()  == beta.k()  && beta.k() == src.k() &&
                eps > 0,
                "\ngamma.num_samples(): " << gamma.num_samples() <<
                "\ngamma.k():  " << gamma.k() <<
                "\ngamma.nr(): " << gamma.nr() <<
                "\ngamma.nc(): " << gamma.nc() <<
                "\nbeta.num_samples(): " << beta.num_samples() <<
                "\nbeta.k():   " << beta.k() <<
                "\nbeta.nr():  " << beta.nr() <<
                "\nbeta.nc():  " << beta.nc() <<
                "\nsrc.k():   " << src.k() <<
                "\nsrc.nr():  " << src.nr() <<
                "\nsrc.nc():  " << src.nc() <<
                "\neps:  " << eps
            );
            const float in_scale = 1;
            const float out_scale = 0;

            dest.copy_size(src);
            means.set_size(1, src.k());
            invstds.copy_size(means);
            running_means.copy_size(means);
            running_variances.copy_size(means);
            // cuDNN requires that running_means and running_variances be initialized to
            // some valid float values even if the averaging factor would have ignored
            // them.
            if (averaging_factor == 1)
            {
                running_means = 0;
                running_variances = 1;
            }

            CHECK_CUDNN(cudnnBatchNormalizationForwardTraining(
                                context(),
                                CUDNN_BATCHNORM_SPATIAL,
                                &in_scale,
                                &out_scale,
                                descriptor(src),
                                src.device(),
                                descriptor(dest),
                                dest.device(),
                                descriptor(gamma),
                                gamma.device(),
                                beta.device(),
                                averaging_factor,
                                running_means.device(),
                                running_variances.device(),
                                eps,
                                means.device(),
                                invstds.device()));
        }

        void batch_normalize_conv_gradient(
            const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
            const tensor& src,
            const tensor& gamma,
            tensor& src_grad,
            tensor& gamma_grad,
            tensor& beta_grad
        )
        {
            DLIB_CASSERT(src.k() == (long)means.size());
            DLIB_CASSERT(src.k() == (long)invstds.size());
            DLIB_CASSERT(src.k() == (long)gamma.size());
            DLIB_CASSERT(src.k() == (long)gamma_grad.size());
            DLIB_CASSERT(src.k() == (long)beta_grad.size());
            DLIB_CASSERT(have_same_dimensions(gradient_input, src));
            DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad));
            DLIB_CASSERT(eps > 0);

            const float in_scale = 1;
            const float out_scale = 1;
            const float in_scale_params = 1;
            const float out_scale_params = 0;

            CHECK_CUDNN(cudnnBatchNormalizationBackward(
                                context(),
                                CUDNN_BATCHNORM_SPATIAL,
                                &in_scale,
                                &out_scale,
                                &in_scale_params,
                                &out_scale_params,
                                descriptor(src),
                                src.device(),
                                descriptor(gradient_input),
                                gradient_input.device(),
                                descriptor(src_grad),
                                src_grad.device(),
                                descriptor(gamma),
                                gamma.device(),
                                gamma_grad.device(),
                                beta_grad.device(),
                                eps,
                                means.device(),
                                invstds.device()));
        }

    // ------------------------------------------------------------------------------------
    // ------------------------------------------------------------------------------------

        tensor_conv::
        tensor_conv(
        ) :
            filter_handle(nullptr),
            conv_handle(nullptr),
            forward_algo(0),
            backward_data_algo(0),
            backward_filters_algo(0)
        {
            clear();
        }

        void tensor_conv::
        clear (
        )
        {
            if (filter_handle)
                cudnnDestroyFilterDescriptor((cudnnFilterDescriptor_t)filter_handle);
            if (conv_handle)
                cudnnDestroyConvolutionDescriptor((cudnnConvolutionDescriptor_t)conv_handle);
            filter_handle = nullptr;
            conv_handle = nullptr;
            out_num_samples = 0;
            out_k = 0;
            out_nr = 0;
            out_nc = 0;

            stride_y = 0;
            stride_x = 0;
            padding_y = 0;
            padding_x = 0;
            data_num_samples = 0;
            data_k = 0;
            data_nr = 0;
            data_nc = 0;
            filters_num_samples = 0;
            filters_k = 0;
            filters_nr = 0;
            filters_nc = 0;

            forward_algo = 0;
            backward_data_algo = 0;
            backward_filters_algo = 0;

            forward_workspace_size_in_bytes = 0;
            backward_data_workspace_size_in_bytes = 0;
            backward_filters_workspace_size_in_bytes = 0;

            forward_workspace.reset();
            backward_data_workspace.reset();
            backward_filters_workspace.reset();
        }

        // Given an array of cudnn algorithm performance results, like
        // cudnnConvolutionFwdAlgoPerf_t, pick the best one to use.
        template <typename T>
        decltype(std::declval<T>().algo) pick_best_algorithm(const std::vector<T> &perf_results)
        {
            DLIB_CASSERT(!perf_results.empty());
            CHECK_CUDNN(perf_results[0].status);
            if (dnn_prefer_fastest_algorithms())
                return perf_results[0].algo;

            // Otherwise we find the algorithm that has a good status and uses the least amount
            // of memory.
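            // (Note: perf_results[0].status was already checked above, so at least one
            // entry has CUDNN_STATUS_SUCCESS and best_alg is always assigned before it is
            // returned.)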
            size_t best_memory = std::numeric_limits<size_t>::max();
            decltype(std::declval<T>().algo) best_alg;
            for (auto&& perf : perf_results)
            {
                if (perf.status == CUDNN_STATUS_SUCCESS && perf.memory < best_memory)
                {
                    best_memory = perf.memory;
                    best_alg = perf.algo;
                }
            }
            return best_alg;
        }

        void tensor_conv::
        select_best_algorithms (
            const tensor& data,
            const tensor_descriptor& dest_desc
        )
        {
            // Calling the cuDNN "find the best algorithm" functions is really slow.  So we keep a
            // cache that tells us what method was best for a particular configuration.
            thread_local std::map<std::tuple<int,int,int,int,long,long>,
                                  std::tuple<int,int,int>> config_to_algo_cache;

            // If we have already found good algorithms for this setting then just pull them from
            // the cache.
            const auto cache_key = std::make_tuple(stride_y, stride_x, padding_y, padding_x, filters_nr, filters_nc);
            const auto iter = config_to_algo_cache.find(cache_key);
            if (iter != config_to_algo_cache.end())
            {
                std::tie(forward_algo, backward_data_algo, backward_filters_algo) = iter->second;
                return;
            }
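
            // The cache is thread_local, so each thread maintains its own mapping.  Note
            // also that the key only captures the stride, padding, and filter size, not
            // the data tensor dimensions, so the algorithms chosen for the first data
            // size seen with a given key are reused for later calls that share that key.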


            // Pick which forward algorithm we will use and allocate the necessary
            // workspace buffer.
            cudnnConvolutionFwdAlgo_t forward_best_algo;
#if CUDNN_MAJOR >= 8
            {
                int num_possible_algorithms = 0;
                CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithmMaxCount(context(), &num_possible_algorithms));
                std::vector<cudnnConvolutionFwdAlgoPerf_t> perf_results(num_possible_algorithms);
                int num_algorithms = 0;
                CHECK_CUDNN(cudnnFindConvolutionForwardAlgorithm(
                        context(),
                        descriptor(data),
                        (const cudnnFilterDescriptor_t)filter_handle,
                        (const cudnnConvolutionDescriptor_t)conv_handle,
                        descriptor(dest_desc),
                        num_possible_algorithms,
                        &num_algorithms,
                        perf_results.data()));
                perf_results.resize(num_algorithms);
                forward_best_algo = pick_best_algorithm(perf_results);
            }
#else
            CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithm(
                    context(),
                    descriptor(data),
                    (const cudnnFilterDescriptor_t)filter_handle,
                    (const cudnnConvolutionDescriptor_t)conv_handle,
                    descriptor(dest_desc),
                    dnn_prefer_fastest_algorithms()?CUDNN_CONVOLUTION_FWD_PREFER_FASTEST:CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
                    std::numeric_limits<size_t>::max(),
                    &forward_best_algo));
#endif
            forward_algo = forward_best_algo;



            // Pick which backward data algorithm we will use and allocate the
            // necessary workspace buffer.
            cudnnConvolutionBwdDataAlgo_t backward_data_best_algo;
#if CUDNN_MAJOR >= 8
            {
                int num_possible_algorithms = 0;
                CHECK_CUDNN(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(context(), &num_possible_algorithms));
                std::vector<cudnnConvolutionBwdDataAlgoPerf_t> perf_results(num_possible_algorithms);
                int num_algorithms = 0;
                CHECK_CUDNN(cudnnFindConvolutionBackwardDataAlgorithm(
                        context(),
                        (const cudnnFilterDescriptor_t)filter_handle,
                        descriptor(dest_desc),
                        (const cudnnConvolutionDescriptor_t)conv_handle,
                        descriptor(data),
                        num_possible_algorithms,
                        &num_algorithms,
                        perf_results.data()));
                perf_results.resize(num_algorithms);
                backward_data_best_algo = pick_best_algorithm(perf_results);
            }
#else
            CHECK_CUDNN(cudnnGetConvolutionBackwardDataAlgorithm(
                    context(),
                    (const cudnnFilterDescriptor_t)filter_handle,
                    descriptor(dest_desc),
                    (const cudnnConvolutionDescriptor_t)conv_handle,
                    descriptor(data),
                    dnn_prefer_fastest_algorithms()?CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST:CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE,
                    std::numeric_limits<size_t>::max(),
                    &backward_data_best_algo));
#endif
            backward_data_algo = backward_data_best_algo;




            // Pick which backward filters algorithm we will use and allocate the
            // necessary workspace buffer.
            cudnnConvolutionBwdFilterAlgo_t backward_filters_best_algo;
#if CUDNN_MAJOR >= 8
            {
                int num_possible_algorithms = 0;
                CHECK_CUDNN(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(context(), &num_possible_algorithms));
                std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> perf_results(num_possible_algorithms);
                int num_algorithms = 0;
                CHECK_CUDNN(cudnnFindConvolutionBackwardFilterAlgorithm(
                        context(),
                        descriptor(data),
                        descriptor(dest_desc),
                        (const cudnnConvolutionDescriptor_t)conv_handle,
                        (const cudnnFilterDescriptor_t)filter_handle,
                        num_possible_algorithms,
                        &num_algorithms,
                        perf_results.data()));
                perf_results.resize(num_algorithms);
                backward_filters_best_algo = pick_best_algorithm(perf_results);
            }
#else
            CHECK_CUDNN(cudnnGetConvolutionBackwardFilterAlgorithm(
                    context(),
                    descriptor(data),
                    descriptor(dest_desc),
                    (const cudnnConvolutionDescriptor_t)conv_handle,
                    (const cudnnFilterDescriptor_t)filter_handle,
                    dnn_prefer_fastest_algorithms()?CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST:CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE,
                    std::numeric_limits<size_t>::max(),
                    &backward_filters_best_algo));
#endif

            // cuDNN 5.1 has a bug that causes
            // cudnnGetConvolutionBackwardFilterAlgorithm() to pick the winograd
            // algorithm even for cases where cuDNN doesn't support it, leading to
            // incorrect outputs.  So here we check if we are in a case where winograd
            // isn't supported and manually overrule
            // cudnnGetConvolutionBackwardFilterAlgorithm() by picking a safe
            // algorithm.
            if (dnn_prefer_fastest_algorithms() &&
                !(stride_x == 1 && stride_y == 1 && ((filters_nr==3&&filters_nc==3) || (filters_nr==5&&filters_nc==5)))
            )
            {
                backward_filters_best_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
            }
            backward_filters_algo = backward_filters_best_algo;


            // Save this algorithm selection in the cache
            config_to_algo_cache[cache_key] = std::make_tuple(forward_algo, backward_data_algo, backward_filters_algo);
        }

        void tensor_conv::
        setup(
            const tensor& data,
            const tensor& filters,
            int stride_y_,
            int stride_x_,
            int padding_y_,
            int padding_x_
        )
        {
            DLIB_CASSERT(data.k() == filters.k());

            // if the last call to setup gave the same exact settings then don't do
            // anything.
            if (data_num_samples == data.num_samples() &&
                data_k == data.k() &&
                data_nr == data.nr() &&
                data_nc == data.nc() &&
                stride_y_ == stride_y &&
                stride_x_ == stride_x &&
                padding_y_ == padding_y &&
                padding_x_ == padding_x &&
                filters_num_samples == filters.num_samples() &&
                filters_k == filters.k() &&
                filters_nr == filters.nr() &&
                filters_nc == filters.nc()
            )
            {
                return;
            }

            clear();
            try
            {
                stride_y = stride_y_;
                stride_x = stride_x_;
                padding_y = padding_y_;
                padding_x = padding_x_;
                data_num_samples = data.num_samples();
                data_k = data.k();
                data_nr = data.nr();
                data_nc = data.nc();
                filters_num_samples = filters.num_samples();
                filters_k = filters.k();
                filters_nr = filters.nr();
                filters_nc = filters.nc();

                CHECK_CUDNN(cudnnCreateFilterDescriptor((cudnnFilterDescriptor_t*)&filter_handle));
                CHECK_CUDNN(cudnnSetFilter4dDescriptor((cudnnFilterDescriptor_t)filter_handle,
                                                 CUDNN_DATA_FLOAT,
                                                 CUDNN_TENSOR_NCHW,
                                                 filters.num_samples(),
                                                 filters.k(),
                                                 filters.nr(),
                                                 filters.nc()));

                CHECK_CUDNN(cudnnCreateConvolutionDescriptor((cudnnConvolutionDescriptor_t*)&conv_handle));
#if CUDNN_MAJOR >= 6
                CHECK_CUDNN(cudnnSetConvolution2dDescriptor((cudnnConvolutionDescriptor_t)conv_handle,
                        padding_y, // vertical padding
                        padding_x, // horizontal padding
                        stride_y,
                        stride_x,
                        1, 1, // must be 1,1
                        CUDNN_CROSS_CORRELATION,
                        CUDNN_DATA_FLOAT)); // could also be CUDNN_CONVOLUTION
#else
                CHECK_CUDNN(cudnnSetConvolution2dDescriptor((cudnnConvolutionDescriptor_t)conv_handle,
                        padding_y, // vertical padding
                        padding_x, // horizontal padding
                        stride_y,
                        stride_x,
                        1, 1, // must be 1,1
                        CUDNN_CROSS_CORRELATION)); // could also be CUDNN_CONVOLUTION
#endif

                CHECK_CUDNN(cudnnGetConvolution2dForwardOutputDim(
                        (const cudnnConvolutionDescriptor_t)conv_handle,
                        descriptor(data),
                        (const cudnnFilterDescriptor_t)filter_handle,
                        &out_num_samples,
                        &out_k,
                        &out_nr,
                        &out_nc));
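                // cuDNN reports the output geometry here; for this descriptor it follows
                // the usual convolution output formula, matching the checks in
                // operator() further below:
                //   out_nr = 1 + (data.nr() + 2*padding_y - filters.nr())/stride_y
                //   out_nc = 1 + (data.nc() + 2*padding_x - filters.nc())/stride_x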
1013 
1014                 tensor_descriptor dest_desc;
1015                 dest_desc.set_size(out_num_samples,out_k,out_nr,out_nc);
1016 
1017                 select_best_algorithms(data, dest_desc);
1018 
1019                 CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize(
1020                         context(),
1021                         descriptor(data),
1022                         (const cudnnFilterDescriptor_t)filter_handle,
1023                         (const cudnnConvolutionDescriptor_t)conv_handle,
1024                         descriptor(dest_desc),
1025                         (cudnnConvolutionFwdAlgo_t)forward_algo,
1026                         &forward_workspace_size_in_bytes));
1027 
1028 
1029                 CHECK_CUDNN(cudnnGetConvolutionBackwardDataWorkspaceSize(
1030                         context(),
1031                         (const cudnnFilterDescriptor_t)filter_handle,
1032                         descriptor(dest_desc),
1033                         (const cudnnConvolutionDescriptor_t)conv_handle,
1034                         descriptor(data),
1035                         (cudnnConvolutionBwdDataAlgo_t)backward_data_algo,
1036                         &backward_data_workspace_size_in_bytes));
1037 
1038 
1039                 CHECK_CUDNN(cudnnGetConvolutionBackwardFilterWorkspaceSize(
1040                         context(),
1041                         descriptor(data),
1042                         descriptor(dest_desc),
1043                         (const cudnnConvolutionDescriptor_t)conv_handle,
1044                         (const cudnnFilterDescriptor_t)filter_handle,
1045                         (cudnnConvolutionBwdFilterAlgo_t)backward_filters_algo,
1046                         &backward_filters_workspace_size_in_bytes));
1047             }
1048             catch(...)
1049             {
1050                 clear();
1051                 throw;
1052             }
1053         }
1054 
1055         tensor_conv::
~tensor_conv()1056         ~tensor_conv (
1057         )
1058         {
1059             clear();
1060         }
1061 
operator ()(const bool add_to_output,resizable_tensor & output,const tensor & data,const tensor & filters)1062         void tensor_conv::operator() (
1063             const bool add_to_output,
1064             resizable_tensor& output,
1065             const tensor& data,
1066             const tensor& filters
1067         )
1068         {
1069             DLIB_CASSERT(stride_y > 0 && stride_x > 0, "You must call setup() before calling this function");
1070 
1071             output.set_size(out_num_samples, out_k, out_nr, out_nc);
1072             (*this)(add_to_output, static_cast<tensor&>(output), data, filters);
1073         }
1074 
operator ()(const bool add_to_output,tensor & output,const tensor & data,const tensor & filters)1075         void tensor_conv::operator() (
1076             const bool add_to_output,
1077             tensor& output,
1078             const tensor& data,
1079             const tensor& filters
1080         )
1081         {
1082             DLIB_CASSERT(is_same_object(output,data) == false);
1083             DLIB_CASSERT(is_same_object(output,filters) == false);
1084             DLIB_CASSERT(filters.k() == data.k());
1085             DLIB_CASSERT(stride_y > 0 && stride_x > 0, "You must call setup() before calling this function");
1086             DLIB_CASSERT(filters.nc() <= data.nc() + 2*padding_x,
1087                 "Filter windows must be small enough to fit into the padded image."
1088                 << "\n\t filters.nc(): " << filters.nc()
1089                 << "\n\t data.nc():  " << data.nc()
1090                 << "\n\t padding_x: " << padding_x
1091                 );
1092             DLIB_CASSERT(filters.nr() <= data.nr() + 2*padding_y,
1093                 "Filter windows must be small enough to fit into the padded image."
1094                 << "\n\t filters.nr(): " << filters.nr()
1095                 << "\n\t data.nr():  " << data.nr()
1096                 << "\n\t padding_y: " << padding_y
1097                 );
1098 
1099 
1100             DLIB_CASSERT(output.num_samples() == data.num_samples(),out_num_samples << "  " << data.num_samples());
1101             DLIB_CASSERT(output.k() == filters.num_samples());
1102             DLIB_CASSERT(output.nr() == 1+(data.nr()+2*padding_y-filters.nr())/stride_y);
1103             DLIB_CASSERT(output.nc() == 1+(data.nc()+2*padding_x-filters.nc())/stride_x);
1104 
1105 
1106 
1107             const float alpha = 1;
1108             const float beta = add_to_output ? 1 : 0;
1109 
1110             // Since cudnnConvolutionForward() is an asynchronous call, we need to hold a
1111             // reference to the workspace buffer so we can be sure it isn't reallocated
1112             // while the function is still executing on the device.  But each time we come
1113             // here, we make sure to grab the latest workspace buffer so that, globally, we
1114             // minimize the number of such buffers.
1115             forward_workspace = device_global_buffer(forward_workspace_size_in_bytes);
1116 
1117             CHECK_CUDNN(cudnnConvolutionForward(
1118                     context(),
1119                     &alpha,
1120                     descriptor(data),
1121                     data.device(),
1122                     (const cudnnFilterDescriptor_t)filter_handle,
1123                     filters.device(),
1124                     (const cudnnConvolutionDescriptor_t)conv_handle,
1125                     (cudnnConvolutionFwdAlgo_t)forward_algo,
1126                     forward_workspace,
1127                     forward_workspace_size_in_bytes,
1128                     &beta,
1129                     descriptor(output),
1130                     output.device()));
1131         }
1132 
get_gradient_for_data(const bool add_to_output,const tensor & gradient_input,const tensor & filters,tensor & data_gradient)1133         void tensor_conv::get_gradient_for_data (
1134             const bool add_to_output,
1135             const tensor& gradient_input,
1136             const tensor& filters,
1137             tensor& data_gradient
1138         )
1139         {
1140             const float alpha = 1;
1141             const float beta = add_to_output ? 1 : 0;
1142 
1143             // Since cudnnConvolutionBackwardData() is an asynchronous call, we need to hold a
1144             // reference to the workspace buffer so we can be sure it isn't reallocated
1145             // while the function is still executing on the device.  But each time we come
1146             // here, we make sure to grab the latest workspace buffer so that, globally, we
1147             // minimize the number of such buffers.
1148             backward_data_workspace = device_global_buffer(backward_data_workspace_size_in_bytes);
1149 
1150 
1151             CHECK_CUDNN(cudnnConvolutionBackwardData(context(),
1152                                                   &alpha,
1153                                                   (const cudnnFilterDescriptor_t)filter_handle,
1154                                                   filters.device(),
1155                                                   descriptor(gradient_input),
1156                                                   gradient_input.device(),
1157                                                   (const cudnnConvolutionDescriptor_t)conv_handle,
1158                                                   (cudnnConvolutionBwdDataAlgo_t)backward_data_algo,
1159                                                   backward_data_workspace,
1160                                                   backward_data_workspace_size_in_bytes,
1161                                                   &beta,
1162                                                   descriptor(data_gradient),
1163                                                   data_gradient.device()));
1164         }

        void tensor_conv::
        get_gradient_for_filters (
            const bool add_to_output,
            const tensor& gradient_input,
            const tensor& data,
            tensor& filters_gradient
        )
        {
            const float alpha = 1;
            const float beta = add_to_output ? 1 : 0;

            // Since cudnnConvolutionBackwardFilter() is an asynchronous call, we need to hold a
            // reference to the workspace buffer so we can be sure it isn't reallocated
            // while the function is still executing on the device.  But each time we come
            // here, we make sure to grab the latest workspace buffer so that, globally, we
            // minimize the number of such buffers.
            backward_filters_workspace = device_global_buffer(backward_filters_workspace_size_in_bytes);

            CHECK_CUDNN(cudnnConvolutionBackwardFilter(context(),
                                                    &alpha,
                                                    descriptor(data),
                                                    data.device(),
                                                    descriptor(gradient_input),
                                                    gradient_input.device(),
                                                    (const cudnnConvolutionDescriptor_t)conv_handle,
                                                    (cudnnConvolutionBwdFilterAlgo_t)backward_filters_algo,
                                                    backward_filters_workspace,
                                                    backward_filters_workspace_size_in_bytes,
                                                    &beta,
                                                    (const cudnnFilterDescriptor_t)filter_handle,
                                                    filters_gradient.device()));
        }
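
        // Implementation note (added commentary, not part of the original sources): both
        // backward calls above rely on cuDNN's standard output blending,
        //
        //     out = alpha*computed_gradient + beta*out
        //
        // so passing beta==1 accumulates the freshly computed gradient into whatever the
        // output tensor already holds, while beta==0 simply overwrites it.  That is all
        // the add_to_output flag has to toggle; no temporary buffer is needed.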

    // ------------------------------------------------------------------------------------
    // ------------------------------------------------------------------------------------

        pooling::pooling (
        ) : handle(nullptr),window_height(0),window_width(0),stride_y(0),stride_x(0),padding_y(0), padding_x(0)
        {
        }

        pooling::~pooling(
        )
        {
            clear();
        }

        void pooling::
        clear(
        )
        {
            if (handle)
                cudnnDestroyPoolingDescriptor((cudnnPoolingDescriptor_t)handle);
            handle = nullptr;
            window_height = 0;
            window_width = 0;
            stride_y = 0;
            stride_x = 0;
            padding_y = 0;
            padding_x = 0;
        }

        void pooling::
        setup_max_pooling(
            int window_height_,
            int window_width_,
            int stride_y_,
            int stride_x_,
            int padding_y_,
            int padding_x_
        )
        {
            setup(window_height_, window_width_, stride_y_, stride_x_, padding_y_, padding_x_, CUDNN_POOLING_MAX);
            do_max_pooling = true;
        }

        void pooling::
        setup_avg_pooling(
            int window_height_,
            int window_width_,
            int stride_y_,
            int stride_x_,
            int padding_y_,
            int padding_x_
        )
        {
            setup(window_height_, window_width_, stride_y_, stride_x_, padding_y_, padding_x_, CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING);
            do_max_pooling = false;
        }
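
        // Note (added commentary): CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING makes the
        // average at each output location divide only by the number of window elements
        // that actually fall inside the input image, so zero-padded cells outside the
        // image do not drag the average down near the borders.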

        void pooling::
        setup(
            int window_height_,
            int window_width_,
            int stride_y_,
            int stride_x_,
            int padding_y_,
            int padding_x_,
            int pooling_mode
        )
        {
            DLIB_CASSERT (window_height_ > 0 && window_width_ > 0 &&
                          stride_y_ > 0 && stride_x_ > 0 ,
                          "window_height_: " << window_height_
                          << "\t\n window_width_: " << window_width_
                          << "\t\n stride_y_: " << stride_y_
                          << "\t\n stride_x_: " << stride_x_ );
            DLIB_CASSERT( 0 <= padding_y_ && padding_y_ < window_height_ &&
                          0 <= padding_x_ && padding_x_ < window_width_,
                          "window_height_: " << window_height_
                          << "\t\n window_width_: " << window_width_
                          << "\t\n padding_y_: " << padding_y_
                          << "\t\n padding_x_: " << padding_x_ );

            if (window_height == window_height_ &&
                window_width  == window_width_ &&
                stride_y == stride_y_ &&
                stride_x == stride_x_ &&
                padding_y == padding_y_ &&
                padding_x == padding_x_
                )
            {
                return;
            }

            clear();
            try
            {
                window_height = window_height_;
                window_width = window_width_;
                stride_x = stride_x_;
                stride_y = stride_y_;
                padding_y  = padding_y_;
                padding_x  = padding_x_;
                cudnnPoolingDescriptor_t poolingDesc;
                CHECK_CUDNN(cudnnCreatePoolingDescriptor(&poolingDesc));
                handle = poolingDesc;

                CHECK_CUDNN(cudnnSetPooling2dDescriptor(poolingDesc,
                                                (cudnnPoolingMode_t)pooling_mode,
                                                CUDNN_PROPAGATE_NAN,
                                                window_height,
                                                window_width,
                                                padding_y,
                                                padding_x,
                                                stride_y,
                                                stride_x));
            }
            catch(...)
            {
                clear();
                throw;
            }
        }
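
        // Note (added commentary): setup() returns early when called with the same
        // window/stride/padding values it already holds, so repeated calls during
        // training do not keep destroying and recreating the cuDNN pooling descriptor.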

        void pooling::
        operator() (
            resizable_tensor& dest,
            const tensor& src
        )
        {
            DLIB_CASSERT(window_width  <= src.nc() + 2*padding_x,
                "Pooling windows must be small enough to fit into the padded image."
                << "\n\t window_width: " << window_width
                << "\n\t src.nc():  " << src.nc()
                << "\n\t padding_x: " << padding_x
                );
            DLIB_CASSERT(window_height <= src.nr() + 2*padding_y,
                "Pooling windows must be small enough to fit into the padded image."
                << "\n\t window_height: " << window_height
                << "\n\t src.nr():  " << src.nr()
                << "\n\t padding_y: " << padding_y
                );
            const float alpha = 1;
            const float beta = 0;
            int outN;
            int outC;
            int outH;
            int outW;
            CHECK_CUDNN(cudnnGetPooling2dForwardOutputDim((const cudnnPoolingDescriptor_t)handle,
                                                    descriptor(src),
                                                    &outN,
                                                    &outC,
                                                    &outH,
                                                    &outW));


            dest.set_size(outN,outC,outH,outW);

            DLIB_CASSERT(dest.num_samples() == src.num_samples());
            DLIB_CASSERT(dest.k() == src.k());
            DLIB_CASSERT(dest.nr() == 1 + (src.nr() + 2*padding_y - window_height)/stride_y,
                "\n stride_y:  " << stride_y  <<
                "\n padding_y: " << padding_y  <<
                "\n window_height: " << window_height  <<
                "\n src.nr(): " << src.nr()  <<
                "\n dest.nr(): " << dest.nr()  <<
                "\n src.nr()/stride_y: " <<  src.nr()/stride_y);
            DLIB_CASSERT(dest.nc() == 1 + (src.nc() + 2*padding_x - window_width)/stride_x,
                "\n stride_x:  " << stride_x  <<
                "\n padding_x: " << padding_x  <<
                "\n window_width: " << window_width  <<
                "\n src.nc(): " << src.nc()  <<
                "\n dest.nc(): " << dest.nc()  <<
                "\n src.nc()/stride_x: " <<  src.nc()/stride_x);

            CHECK_CUDNN(cudnnPoolingForward(context(),
                                     (const cudnnPoolingDescriptor_t)handle,
                                     &alpha,
                                     descriptor(src),
                                     src.device(),
                                     &beta,
                                     descriptor(dest),
                                     dest.device()));
        }
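
        // Worked example (added commentary; the numbers are made up for illustration):
        // the checks above restate cuDNN's pooling output size formula
        //
        //     out = 1 + (in + 2*padding - window)/stride
        //
        // e.g. a 5 row input with a 3x3 window, stride 2, and no padding gives
        // 1 + (5 + 0 - 3)/2 = 2 output rows.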

        void pooling::get_gradient(
            const tensor& gradient_input,
            const tensor& dest,
            const tensor& src,
            tensor& grad
        )
        {
            DLIB_CASSERT(have_same_dimensions(gradient_input,dest));
            DLIB_CASSERT(have_same_dimensions(src,grad));

            const float alpha = 1;
            const float beta = 1;
            CHECK_CUDNN(cudnnPoolingBackward(context(),
                                       (const cudnnPoolingDescriptor_t)handle,
                                       &alpha,
                                       descriptor(dest),
                                       dest.device(),
                                       descriptor(gradient_input),
                                       gradient_input.device(),
                                       descriptor(src),
                                       src.device(),
                                       &beta,
                                       descriptor(grad),
                                       grad.device()));
        }
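
        // Minimal usage sketch (added commentary; the tensor names are hypothetical):
        //
        //     pooling p;
        //     p.setup_max_pooling(3,3, 2,2, 1,1);   // 3x3 window, stride 2, padding 1
        //     resizable_tensor out;
        //     p(out, some_input_tensor);            // out is sized from the descriptor
        //     p.get_gradient(grad_in, out, some_input_tensor, grad_out);
        //
        // Note that get_gradient() uses beta==1, so it always adds into grad rather than
        // overwriting it.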

    // ------------------------------------------------------------------------------------
    // ------------------------------------------------------------------------------------

        void softmax (
            tensor& dest,
            const tensor& src
        )
        {
            DLIB_CASSERT(have_same_dimensions(dest,src));
            if (src.size() == 0)
                return;

            const float alpha = 1;
            const float beta = 0;

            CHECK_CUDNN(cudnnSoftmaxForward(context(),
                                      CUDNN_SOFTMAX_ACCURATE,
                                      CUDNN_SOFTMAX_MODE_CHANNEL,
                                      &alpha,
                                      descriptor(src),
                                      src.device(),
                                      &beta,
                                      descriptor(dest),
                                      dest.device()));
        }


        void softmax_gradient (
            tensor& grad,
            const tensor& dest,
            const tensor& gradient_input
        )
        {
            DLIB_CASSERT(
                  have_same_dimensions(dest,gradient_input) == true &&
                  have_same_dimensions(dest,grad) == true );
            if (dest.size() == 0)
                return;

            const float alpha = 1;
            const float beta = is_same_object(grad,gradient_input) ? 0 : 1;
            CHECK_CUDNN(cudnnSoftmaxBackward(context(),
                                      CUDNN_SOFTMAX_ACCURATE,
                                      CUDNN_SOFTMAX_MODE_CHANNEL,
                                      &alpha,
                                      descriptor(dest),
                                      dest.device(),
                                      descriptor(gradient_input),
                                      gradient_input.device(),
                                      &beta,
                                      descriptor(grad),
                                      grad.device()));
        }
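
        // Note (added commentary): here and in the remaining *_gradient functions below,
        // beta is chosen from is_same_object(grad,gradient_input).  When grad aliases
        // gradient_input the result must overwrite it (beta==0); otherwise the computed
        // gradient is accumulated into grad (beta==1), matching the convention that
        // backward passes add into existing gradients.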

    // ------------------------------------------------------------------------------------
    // ------------------------------------------------------------------------------------

        void softmax_all (
            tensor& dest,
            const tensor& src
        )
        {
            DLIB_CASSERT(have_same_dimensions(dest,src));
            if (src.size() == 0)
                return;

            const float alpha = 1;
            const float beta = 0;

            CHECK_CUDNN(cudnnSoftmaxForward(context(),
                                      CUDNN_SOFTMAX_ACCURATE,
                                      CUDNN_SOFTMAX_MODE_INSTANCE,
                                      &alpha,
                                      descriptor(src),
                                      src.device(),
                                      &beta,
                                      descriptor(dest),
                                      dest.device()));
        }
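
        // Note (added commentary): the difference between softmax() and softmax_all() is
        // only the cuDNN mode.  CUDNN_SOFTMAX_MODE_CHANNEL normalizes each spatial
        // location independently across the k channels, while CUDNN_SOFTMAX_MODE_INSTANCE
        // normalizes over all k*nr*nc values of each sample at once.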


        void softmax_all_gradient (
            tensor& grad,
            const tensor& dest,
            const tensor& gradient_input
        )
        {
            DLIB_CASSERT(
                  have_same_dimensions(dest,gradient_input) == true &&
                  have_same_dimensions(dest,grad) == true );
            if (dest.size() == 0)
                return;

            const float alpha = 1;
            const float beta = is_same_object(grad,gradient_input) ? 0 : 1;
            CHECK_CUDNN(cudnnSoftmaxBackward(context(),
                                      CUDNN_SOFTMAX_ACCURATE,
                                      CUDNN_SOFTMAX_MODE_INSTANCE,
                                      &alpha,
                                      descriptor(dest),
                                      dest.device(),
                                      descriptor(gradient_input),
                                      gradient_input.device(),
                                      &beta,
                                      descriptor(grad),
                                      grad.device()));
        }

    // ------------------------------------------------------------------------------------
    // ------------------------------------------------------------------------------------

        void sigmoid (
            tensor& dest,
            const tensor& src
        )
        {
            DLIB_CASSERT(have_same_dimensions(dest,src));
            if (src.size() == 0)
                return;

            const float alpha = 1;
            const float beta = 0;
            CHECK_CUDNN(cudnnActivationForward(context(),
                                         sigmoid_activation_descriptor(),
                                         &alpha,
                                         descriptor(src),
                                         src.device(),
                                         &beta,
                                         descriptor(dest),
                                         dest.device()));
        }

        void sigmoid_gradient (
            tensor& grad,
            const tensor& dest,
            const tensor& gradient_input
        )
        {
            DLIB_CASSERT(
                  have_same_dimensions(dest,gradient_input) == true &&
                  have_same_dimensions(dest,grad) == true );
            if (dest.size() == 0)
                return;

            const float alpha = 1;
            const float beta = is_same_object(grad,gradient_input) ? 0 : 1;
            CHECK_CUDNN(cudnnActivationBackward(context(),
                                          sigmoid_activation_descriptor(),
                                          &alpha,
                                          descriptor(dest),
                                          dest.device(),
                                          descriptor(gradient_input),
                                          gradient_input.device(),
                                          descriptor(dest),
                                          dest.device(),
                                          &beta,
                                          descriptor(grad),
                                          grad.device()));
        }

    // ------------------------------------------------------------------------------------

        void relu (
            tensor& dest,
            const tensor& src
        )
        {
            DLIB_CASSERT(have_same_dimensions(dest,src));
            if (src.size() == 0)
                return;

            const float alpha = 1;
            const float beta = 0;
            CHECK_CUDNN(cudnnActivationForward(context(),
                                         relu_activation_descriptor(),
                                         &alpha,
                                         descriptor(src),
                                         src.device(),
                                         &beta,
                                         descriptor(dest),
                                         dest.device()));
        }

        void relu_gradient (
            tensor& grad,
            const tensor& dest,
            const tensor& gradient_input
        )
        {
            DLIB_CASSERT(
                  have_same_dimensions(dest,gradient_input) == true &&
                  have_same_dimensions(dest,grad) == true );
            if (dest.size() == 0)
                return;

            const float alpha = 1;
            const float beta = is_same_object(grad,gradient_input) ? 0 : 1;
            CHECK_CUDNN(cudnnActivationBackward(context(),
                                          relu_activation_descriptor(),
                                          &alpha,
                                          descriptor(dest),
                                          dest.device(),
                                          descriptor(gradient_input),
                                          gradient_input.device(),
                                          descriptor(dest),
                                          dest.device(),
                                          &beta,
                                          descriptor(grad),
                                          grad.device()));
        }

    // ------------------------------------------------------------------------------------

        void tanh (
            tensor& dest,
            const tensor& src
        )
        {
            DLIB_CASSERT(have_same_dimensions(dest,src));
            if (src.size() == 0)
                return;

            const float alpha = 1;
            const float beta = 0;
            CHECK_CUDNN(cudnnActivationForward(context(),
                                         tanh_activation_descriptor(),
                                         &alpha,
                                         descriptor(src),
                                         src.device(),
                                         &beta,
                                         descriptor(dest),
                                         dest.device()));
        }

        void tanh_gradient (
            tensor& grad,
            const tensor& dest,
            const tensor& gradient_input
        )
        {
            DLIB_CASSERT(
                  have_same_dimensions(dest,gradient_input) == true &&
                  have_same_dimensions(dest,grad) == true);
            if (dest.size() == 0)
                return;

            const float alpha = 1;
            const float beta = is_same_object(grad,gradient_input) ? 0 : 1;
            CHECK_CUDNN(cudnnActivationBackward(context(),
                                          tanh_activation_descriptor(),
                                          &alpha,
                                          descriptor(dest),
                                          dest.device(),
                                          descriptor(gradient_input),
                                          gradient_input.device(),
                                          descriptor(dest),
                                          dest.device(),
                                          &beta,
                                          descriptor(grad),
                                          grad.device()));
        }
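
        // Note (added commentary): for these activations cudnnActivationBackward() forms
        // the derivative from the saved outputs (dest), i.e. it effectively computes
        //
        //     grad = beta*grad + gradient_input .* f'(x)
        //
        // with f'(x) expressed as dest*(1-dest) for sigmoid, (dest > 0 ? 1 : 0) for relu,
        // and 1 - dest*dest for tanh, matching the corresponding CPU implementations.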

    // ------------------------------------------------------------------------------------
    }
}

#endif // DLIB_USE_CUDA

#endif // DLIB_DNN_CuDNN_CPP_
