// Copyright (C) 2015  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CuDNN_CPP_
#define DLIB_DNN_CuDNN_CPP_

#ifdef DLIB_USE_CUDA

#include "cudnn_dlibapi.h"
#include "tensor.h"
#include <cudnn.h>
#include <tuple>
#include <map>
#include <iostream>
#include <sstream>   // for std::ostringstream used by CHECK_CUDNN below
#include <limits>    // for std::numeric_limits used in algorithm selection below
#include <string>
#include <vector>
#include "cuda_utils.h"
#include "cpu_dlib.h"
#include "cuda_dlib.h"
#include "tensor_tools.h"

static const char* cudnn_get_error_string(cudnnStatus_t s)
{
    switch(s)
    {
        case CUDNN_STATUS_NOT_INITIALIZED:
            return "CUDA Runtime API initialization failed.";
        case CUDNN_STATUS_ALLOC_FAILED:
            return "CUDA Resources could not be allocated.";
        case CUDNN_STATUS_BAD_PARAM:
            return "CUDNN_STATUS_BAD_PARAM";
        case CUDNN_STATUS_EXECUTION_FAILED:
            return "CUDNN_STATUS_EXECUTION_FAILED";
        case CUDNN_STATUS_NOT_SUPPORTED:
            return "CUDNN_STATUS_NOT_SUPPORTED";
        case CUDNN_STATUS_ARCH_MISMATCH:
            return "CUDNN_STATUS_ARCH_MISMATCH: Your GPU is too old and not supported by cuDNN";
        default:
            return "A call to cuDNN failed";
    }
}

// Check the return value of a call to the cuDNN runtime for an error condition.
#define CHECK_CUDNN(call)                                                              \
do{                                                                                    \
    const cudnnStatus_t error = call;                                                  \
    if (error != CUDNN_STATUS_SUCCESS)                                                 \
    {                                                                                  \
        std::ostringstream sout;                                                       \
        sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". "; \
        sout << "code: " << error << ", reason: " << cudnn_get_error_string(error);    \
        throw dlib::cudnn_error(sout.str());                                           \
    }                                                                                  \
}while(false)
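
// Illustrative usage (not part of the original file): the macro turns cuDNN status
// codes into exceptions, so call sites read as, e.g.:
//   cudnnHandle_t h;
//   CHECK_CUDNN(cudnnCreate(&h));  // throws dlib::cudnn_error on failure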

namespace dlib
{

    namespace cuda
    {

    // ------------------------------------------------------------------------------------

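        // Small helpers that unwrap a dlib tensor (or tensor_descriptor) into the raw
        // cudnnTensorDescriptor_t handle expected by the cuDNN C API.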
        static cudnnTensorDescriptor_t descriptor(const tensor& t)
        {
            return (const cudnnTensorDescriptor_t)t.get_cudnn_tensor_descriptor().get_handle();
        }
        static cudnnTensorDescriptor_t descriptor(const tensor_descriptor& t)
        {
            return (const cudnnTensorDescriptor_t)t.get_handle();
        }

    // ------------------------------------------------------------------------------------

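        // Each thread gets its own cudnn_context (see context() below), and within a
        // context we keep one cuDNN handle per CUDA device, created lazily the first
        // time that device is used from this thread.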
        class cudnn_context
        {
        public:
            // not copyable
            cudnn_context(const cudnn_context&) = delete;
            cudnn_context& operator=(const cudnn_context&) = delete;

            cudnn_context()
            {
                handles.resize(16);
            }
            ~cudnn_context()
            {
                for (auto h : handles)
                {
                    if (h)
                        cudnnDestroy(h);
                }
            }

            cudnnHandle_t get_handle (
            )
            {
                int new_device_id;
                CHECK_CUDA(cudaGetDevice(&new_device_id));
                // make room for more devices if needed
                if (new_device_id >= (long)handles.size())
                    handles.resize(new_device_id+16);

                // If we don't have a handle already for this device then make one
                if (!handles[new_device_id])
                    CHECK_CUDNN(cudnnCreate(&handles[new_device_id]));

                // Finally, return the handle for the current device
                return handles[new_device_id];
            }

        private:

            std::vector<cudnnHandle_t> handles;
        };

        static cudnnHandle_t context()
        {
            thread_local cudnn_context c;
            return c.get_handle();
        }
    // ------------------------------------------------------------------------------------

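        // RAII wrapper around a cudnnActivationDescriptor_t.  The thread_local
        // relu/sigmoid/tanh descriptor functions below hand out shared, preconfigured
        // instances so the activation routines don't rebuild descriptors on every call.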
        class cudnn_activation_descriptor
        {
        public:
            // not copyable
            cudnn_activation_descriptor(const cudnn_activation_descriptor&) = delete;
            cudnn_activation_descriptor& operator=(const cudnn_activation_descriptor&) = delete;

            cudnn_activation_descriptor(
                cudnnActivationMode_t mode,
                cudnnNanPropagation_t reluNanOpt,
                double reluCeiling
            )
            {
                CHECK_CUDNN(cudnnCreateActivationDescriptor(&handle));
                CHECK_CUDNN(cudnnSetActivationDescriptor(handle, mode, reluNanOpt, reluCeiling));
            }

            ~cudnn_activation_descriptor()
            {
                cudnnDestroyActivationDescriptor(handle);
            }

            cudnnActivationDescriptor_t get_handle (
            )
            {
                return handle;
            }
        private:
            cudnnActivationDescriptor_t handle;
        };

        static cudnnActivationDescriptor_t relu_activation_descriptor()
        {
            thread_local cudnn_activation_descriptor des(CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0);
            return des.get_handle();
        }

        static cudnnActivationDescriptor_t sigmoid_activation_descriptor()
        {
            thread_local cudnn_activation_descriptor des(CUDNN_ACTIVATION_SIGMOID, CUDNN_PROPAGATE_NAN, 0);
            return des.get_handle();
        }

        static cudnnActivationDescriptor_t tanh_activation_descriptor()
        {
            thread_local cudnn_activation_descriptor des(CUDNN_ACTIVATION_TANH, CUDNN_PROPAGATE_NAN, 0);
            return des.get_handle();
        }

    // ------------------------------------------------------------------------------------

        tensor_descriptor::
        tensor_descriptor(
        ) : handle(nullptr)
        {
        }

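        // The destructor releases the underlying cuDNN descriptor by resizing to all
        // zeros, which set_size() below treats as "free the descriptor".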
        tensor_descriptor::
        ~tensor_descriptor()
        {
            set_size(0,0,0,0);
        }

        void tensor_descriptor::
        set_size(
            int n,
            int k,
            int nr,
            int nc
        )
        {
            if (handle)
            {
                cudnnDestroyTensorDescriptor((cudnnTensorDescriptor_t)handle);
                handle = nullptr;
            }

            if (n != 0 && nr != 0 && nc != 0 && k != 0)
            {
                cudnnTensorDescriptor_t h;
                CHECK_CUDNN(cudnnCreateTensorDescriptor(&h));
                handle = h;

                CHECK_CUDNN(cudnnSetTensor4dDescriptor((cudnnTensorDescriptor_t)handle,
                    CUDNN_TENSOR_NCHW,
                    CUDNN_DATA_FLOAT,
                    n,
                    k,
                    nr,
                    nc));
            }
        }

        void tensor_descriptor::
        get_size (
            int& n,
            int& k,
            int& nr,
            int& nc
        ) const
        {
            if (handle)
            {
                int nStride, cStride, hStride, wStride;
                cudnnDataType_t datatype;
                CHECK_CUDNN(cudnnGetTensor4dDescriptor((cudnnTensorDescriptor_t)handle,
                    &datatype,
                    &n,
                    &k,
                    &nr,
                    &nc,
                    &nStride,
                    &cStride,
                    &hStride,
                    &wStride));
            }
            else
            {
                n = 0;
                k = 0;
                nr = 0;
                nc = 0;
            }
        }

    // ------------------------------------------------------------------------------------

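        // Computes dest = beta*dest + alpha*src, where src either has the same shape as
        // dest or one of the broadcastable shapes enumerated in the assert below (e.g. a
        // per-channel bias with num_samples()==1 and nr()==nc()==1).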
        void add(
            float beta,
            tensor& dest,
            float alpha,
            const tensor& src
        )
        {
            DLIB_CASSERT(
                (have_same_dimensions(src, dest) ||
                (src.num_samples()==1 && src.k()==dest.k() && src.nr()==1 && src.nc()==1) ||
                (src.num_samples()==1 && src.k()==dest.k() && src.nr()==dest.nr() && src.nc()==dest.nc()) ||
                (src.num_samples()==1 && src.k()==1 && src.nr()==dest.nr() && src.nc()==dest.nc()) ||
                (src.num_samples()==dest.num_samples() && src.k()==1 && src.nr()==1 && src.nc()==1)) &&
                is_same_object(src,dest) == false ,
                "\n\t dest.num_samples(): " << dest.num_samples()
                <<"\n\t dest.k(): " << dest.k()
                <<"\n\t dest.nr(): " << dest.nr()
                <<"\n\t dest.nc(): " << dest.nc()
                <<"\n\t src.num_samples(): " << src.num_samples()
                <<"\n\t src.k(): " << src.k()
                <<"\n\t src.nr(): " << src.nr()
                <<"\n\t src.nc(): " << src.nc()
            );

            if (dest.size() == src.size() && beta == 1)
            {
                // Call the dlib function in this case since it's faster than the one that
                // comes with cuDNN (at least as of cuDNN v4).
                add_scaled(dest, alpha, src);
                return;
            }
            else if (src.num_samples()==dest.num_samples() && src.k()==1 && src.nr()==1 && src.nc()==1)
            {
                add_cv_to_all_columns(beta, dest, alpha, src);
                return;
            }

            CHECK_CUDNN(cudnnAddTensor(context(),
                &alpha,
                descriptor(src),
                src.device(),
                &beta,
                descriptor(dest),
                dest.device()));
        }

        void assign_conv_bias_gradient (
            tensor& grad,
            const tensor& gradient_input
        )
        {
            DLIB_CASSERT(
                grad.num_samples() == 1 &&
                grad.k()  >= 1 &&
                grad.nr() == 1 &&
                grad.nc() == 1 &&
                gradient_input.k() == grad.k() &&
                gradient_input.size() > 0 &&
                is_same_object(grad,gradient_input) == false
            );

            const float alpha = 1;
            const float beta = 0;
            CHECK_CUDNN(cudnnConvolutionBackwardBias(context(),
                &alpha,
                descriptor(gradient_input),
                gradient_input.device(),
                &beta,
                descriptor(grad),
                grad.device()));
        }

    // ------------------------------------------------------------------------------------

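        // Runs batch normalization in inference mode using the supplied running
        // statistics, i.e. dest = gamma*(src - running_mean)/sqrt(running_variance + eps) + beta,
        // applied independently to every (k,nr,nc) element (CUDNN_BATCHNORM_PER_ACTIVATION).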
        void batch_normalize_inference (
            const double eps,
            resizable_tensor& dest,
            const tensor& src,
            const tensor& gamma,
            const tensor& beta,
            const tensor& running_means,
            const tensor& running_variances
        )
        {
            DLIB_CASSERT(
                gamma.num_samples() == 1 &&
                gamma.nr() == src.nr() &&
                gamma.nc() == src.nc() &&
                gamma.k()  == src.k() &&
                have_same_dimensions(gamma, beta) &&
                have_same_dimensions(gamma, running_means) &&
                have_same_dimensions(gamma, running_variances) &&
                eps > 0,
                "\ngamma.num_samples(): " << gamma.num_samples() <<
                "\ngamma.k(): " << gamma.k() <<
                "\ngamma.nr(): " << gamma.nr() <<
                "\ngamma.nc(): " << gamma.nc() <<
                "\nbeta.num_samples(): " << beta.num_samples() <<
                "\nbeta.k(): " << beta.k() <<
                "\nbeta.nr(): " << beta.nr() <<
                "\nbeta.nc(): " << beta.nc() <<
                "\nrunning_means.num_samples(): " << running_means.num_samples() <<
                "\nrunning_means.k(): " << running_means.k() <<
                "\nrunning_means.nr(): " << running_means.nr() <<
                "\nrunning_means.nc(): " << running_means.nc() <<
                "\nrunning_variances.num_samples(): " << running_variances.num_samples() <<
                "\nrunning_variances.k(): " << running_variances.k() <<
                "\nrunning_variances.nr(): " << running_variances.nr() <<
                "\nrunning_variances.nc(): " << running_variances.nc() <<
                "\nsrc.k(): " << src.k() <<
                "\nsrc.nr(): " << src.nr() <<
                "\nsrc.nc(): " << src.nc() <<
                "\neps: " << eps
            );
            const float in_scale = 1;
            const float out_scale = 0;

            dest.copy_size(src);

            CHECK_CUDNN(cudnnBatchNormalizationForwardInference(
                context(),
                CUDNN_BATCHNORM_PER_ACTIVATION,
                &in_scale,
                &out_scale,
                descriptor(src),
                src.device(),
                descriptor(dest),
                dest.device(),
                descriptor(gamma),
                gamma.device(),
                beta.device(),
                running_means.device(),
                running_variances.device(),
                eps));
        }

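        // Training-mode batch normalization.  In addition to writing the normalized
        // output into dest, this records the per-activation batch statistics in means
        // and invstds and folds them into running_means/running_variances using
        // averaging_factor (an averaging_factor of 1 discards the old running values).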
        void batch_normalize (
            const double eps,
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
            const double averaging_factor,
            resizable_tensor& running_means,
            resizable_tensor& running_variances,
            const tensor& src,
            const tensor& gamma,
            const tensor& beta
        )
        {
            DLIB_CASSERT(0 <= averaging_factor && averaging_factor <= 1, "averaging_factor: " << averaging_factor);
            DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_means,means));
            DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_variances,invstds));
            DLIB_CASSERT(
                src.num_samples() > 1 &&
                gamma.num_samples() == 1 &&
                beta.num_samples() == 1 &&
                gamma.nr() == beta.nr() && beta.nr() == src.nr() &&
                gamma.nc() == beta.nc() && beta.nc() == src.nc() &&
                gamma.k()  == beta.k()  && beta.k()  == src.k() &&
                eps > 0,
                "\ngamma.num_samples(): " << gamma.num_samples() <<
                "\ngamma.k(): " << gamma.k() <<
                "\ngamma.nr(): " << gamma.nr() <<
                "\ngamma.nc(): " << gamma.nc() <<
                "\nbeta.num_samples(): " << beta.num_samples() <<
                "\nbeta.k(): " << beta.k() <<
                "\nbeta.nr(): " << beta.nr() <<
                "\nbeta.nc(): " << beta.nc() <<
                "\nsrc.k(): " << src.k() <<
                "\nsrc.nr(): " << src.nr() <<
                "\nsrc.nc(): " << src.nc() <<
                "\neps: " << eps
            );

            const float in_scale = 1;
            const float out_scale = 0;

            dest.copy_size(src);
            means.set_size(1, src.k(), src.nr(), src.nc());
            invstds.copy_size(means);
            running_means.copy_size(means);
            running_variances.copy_size(means);
            // cuDNN requires that running_means and running_variances be initialized to
            // some valid float values even if the averaging factor would have ignored
            // them.
            if (averaging_factor == 1)
            {
                running_means = 0;
                running_variances = 1;
            }

            CHECK_CUDNN(cudnnBatchNormalizationForwardTraining(
                context(),
                CUDNN_BATCHNORM_PER_ACTIVATION,
                &in_scale,
                &out_scale,
                descriptor(src),
                src.device(),
                descriptor(dest),
                dest.device(),
                descriptor(gamma),
                gamma.device(),
                beta.device(),
                averaging_factor,
                running_means.device(),
                running_variances.device(),
                eps,
                means.device(),
                invstds.device()));
        }

        void batch_normalize_gradient(
            const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
            const tensor& src,
            const tensor& gamma,
            tensor& src_grad,
            tensor& gamma_grad,
            tensor& beta_grad
        )
        {
            const long num = src.k()*src.nr()*src.nc();
            DLIB_CASSERT(src.num_samples() > 1);
            DLIB_CASSERT(num == (long)means.size());
            DLIB_CASSERT(num == (long)invstds.size());
            DLIB_CASSERT(num == (long)gamma.size());
            DLIB_CASSERT(num == (long)gamma_grad.size());
            DLIB_CASSERT(num == (long)beta_grad.size());
            DLIB_CASSERT(have_same_dimensions(gradient_input, src));
            DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad));
            DLIB_CASSERT(eps > 0);

            const float in_scale = 1;
            const float out_scale = 1;
            const float in_scale_params = 1;
            const float out_scale_params = 0;

            CHECK_CUDNN(cudnnBatchNormalizationBackward(
                context(),
                CUDNN_BATCHNORM_PER_ACTIVATION,
                &in_scale,
                &out_scale,
                &in_scale_params,
                &out_scale_params,
                descriptor(src),
                src.device(),
                descriptor(gradient_input),
                gradient_input.device(),
                descriptor(src_grad),
                src_grad.device(),
                descriptor(gamma),
                gamma.device(),
                gamma_grad.device(),
                beta_grad.device(),
                eps,
                means.device(),
                invstds.device()));
        }

    // ------------------------------------------------------------------------------------

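        // Same as batch_normalize_inference() above, except in CUDNN_BATCHNORM_SPATIAL
        // mode: statistics are shared across all pixels of a channel, so gamma, beta,
        // and the running statistics each hold just src.k() values.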
        void batch_normalize_conv_inference (
            const double eps,
            resizable_tensor& dest,
            const tensor& src,
            const tensor& gamma,
            const tensor& beta,
            const tensor& running_means,
            const tensor& running_variances
        )
        {
            DLIB_CASSERT(
                gamma.num_samples() == 1 &&
                gamma.nr() == 1 &&
                gamma.nc() == 1 &&
                gamma.k()  == src.k() &&
                have_same_dimensions(gamma, beta) &&
                have_same_dimensions(gamma, running_means) &&
                have_same_dimensions(gamma, running_variances) &&
                eps > 0,
                "\ngamma.num_samples(): " << gamma.num_samples() <<
                "\ngamma.k(): " << gamma.k() <<
                "\ngamma.nr(): " << gamma.nr() <<
                "\ngamma.nc(): " << gamma.nc() <<
                "\nbeta.num_samples(): " << beta.num_samples() <<
                "\nbeta.k(): " << beta.k() <<
                "\nbeta.nr(): " << beta.nr() <<
                "\nbeta.nc(): " << beta.nc() <<
                "\nrunning_means.num_samples(): " << running_means.num_samples() <<
                "\nrunning_means.k(): " << running_means.k() <<
                "\nrunning_means.nr(): " << running_means.nr() <<
                "\nrunning_means.nc(): " << running_means.nc() <<
                "\nrunning_variances.num_samples(): " << running_variances.num_samples() <<
                "\nrunning_variances.k(): " << running_variances.k() <<
                "\nrunning_variances.nr(): " << running_variances.nr() <<
                "\nrunning_variances.nc(): " << running_variances.nc() <<
                "\nsrc.k(): " << src.k() <<
                "\nsrc.nr(): " << src.nr() <<
                "\nsrc.nc(): " << src.nc() <<
                "\neps: " << eps
            );
            const float in_scale = 1;
            const float out_scale = 0;

            dest.copy_size(src);

            CHECK_CUDNN(cudnnBatchNormalizationForwardInference(
                context(),
                CUDNN_BATCHNORM_SPATIAL,
                &in_scale,
                &out_scale,
                descriptor(src),
                src.device(),
                descriptor(dest),
                dest.device(),
                descriptor(gamma),
                gamma.device(),
                beta.device(),
                running_means.device(),
                running_variances.device(),
                eps));
        }

        void batch_normalize_conv (
            const double eps,
            resizable_tensor& dest,
            resizable_tensor& means,
            resizable_tensor& invstds,
            const double averaging_factor,
            resizable_tensor& running_means,
            resizable_tensor& running_variances,
            const tensor& src,
            const tensor& gamma,
            const tensor& beta
        )
        {
            DLIB_CASSERT(0 <= averaging_factor && averaging_factor <= 1, "averaging_factor: " << averaging_factor);
            DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_means,means));
            DLIB_CASSERT(averaging_factor==1 || have_same_dimensions(running_variances,invstds));
            DLIB_CASSERT(
                src.num_samples() > 1 &&
                gamma.num_samples() == 1 &&
                beta.num_samples() == 1 &&
                gamma.nr() == 1 &&
                beta.nr() == 1 &&
                gamma.nc() == 1 &&
                beta.nc() == 1 &&
                gamma.k() == beta.k() && beta.k() == src.k() &&
                eps > 0,
                "\ngamma.num_samples(): " << gamma.num_samples() <<
                "\ngamma.k(): " << gamma.k() <<
                "\ngamma.nr(): " << gamma.nr() <<
                "\ngamma.nc(): " << gamma.nc() <<
                "\nbeta.num_samples(): " << beta.num_samples() <<
                "\nbeta.k(): " << beta.k() <<
                "\nbeta.nr(): " << beta.nr() <<
                "\nbeta.nc(): " << beta.nc() <<
                "\nsrc.k(): " << src.k() <<
                "\nsrc.nr(): " << src.nr() <<
                "\nsrc.nc(): " << src.nc() <<
                "\neps: " << eps
            );
            const float in_scale = 1;
            const float out_scale = 0;

            dest.copy_size(src);
            means.set_size(1, src.k());
            invstds.copy_size(means);
            running_means.copy_size(means);
            running_variances.copy_size(means);
            // cuDNN requires that running_means and running_variances be initialized to
            // some valid float values even if the averaging factor would have ignored
            // them.
            if (averaging_factor == 1)
            {
                running_means = 0;
                running_variances = 1;
            }

            CHECK_CUDNN(cudnnBatchNormalizationForwardTraining(
                context(),
                CUDNN_BATCHNORM_SPATIAL,
                &in_scale,
                &out_scale,
                descriptor(src),
                src.device(),
                descriptor(dest),
                dest.device(),
                descriptor(gamma),
                gamma.device(),
                beta.device(),
                averaging_factor,
                running_means.device(),
                running_variances.device(),
                eps,
                means.device(),
                invstds.device()));
        }

        void batch_normalize_conv_gradient(
            const double eps,
            const tensor& gradient_input,
            const tensor& means,
            const tensor& invstds,
            const tensor& src,
            const tensor& gamma,
            tensor& src_grad,
            tensor& gamma_grad,
            tensor& beta_grad
        )
        {
            DLIB_CASSERT(src.k() == (long)means.size());
            DLIB_CASSERT(src.k() == (long)invstds.size());
            DLIB_CASSERT(src.k() == (long)gamma.size());
            DLIB_CASSERT(src.k() == (long)gamma_grad.size());
            DLIB_CASSERT(src.k() == (long)beta_grad.size());
            DLIB_CASSERT(have_same_dimensions(gradient_input, src));
            DLIB_CASSERT(have_same_dimensions(gradient_input, src_grad));
            DLIB_CASSERT(eps > 0);

            const float in_scale = 1;
            const float out_scale = 1;
            const float in_scale_params = 1;
            const float out_scale_params = 0;

            CHECK_CUDNN(cudnnBatchNormalizationBackward(
                context(),
                CUDNN_BATCHNORM_SPATIAL,
                &in_scale,
                &out_scale,
                &in_scale_params,
                &out_scale_params,
                descriptor(src),
                src.device(),
                descriptor(gradient_input),
                gradient_input.device(),
                descriptor(src_grad),
                src_grad.device(),
                descriptor(gamma),
                gamma.device(),
                gamma_grad.device(),
                beta_grad.device(),
                eps,
                means.device(),
                invstds.device()));
        }

    // ------------------------------------------------------------------------------------
    // ------------------------------------------------------------------------------------

        tensor_conv::
        tensor_conv(
        ) :
            filter_handle(nullptr),
            conv_handle(nullptr),
            forward_algo(0),
            backward_data_algo(0),
            backward_filters_algo(0)
        {
            clear();
        }

        void tensor_conv::
        clear (
        )
        {
            if (filter_handle)
                cudnnDestroyFilterDescriptor((cudnnFilterDescriptor_t)filter_handle);
            if (conv_handle)
                cudnnDestroyConvolutionDescriptor((cudnnConvolutionDescriptor_t)conv_handle);
            filter_handle = nullptr;
            conv_handle = nullptr;
            out_num_samples = 0;
            out_k = 0;
            out_nr = 0;
            out_nc = 0;

            stride_y = 0;
            stride_x = 0;
            padding_y = 0;
            padding_x = 0;
            data_num_samples = 0;
            data_k = 0;
            data_nr = 0;
            data_nc = 0;
            filters_num_samples = 0;
            filters_k = 0;
            filters_nr = 0;
            filters_nc = 0;

            forward_algo = 0;
            backward_data_algo = 0;
            backward_filters_algo = 0;

            forward_workspace_size_in_bytes = 0;
            backward_data_workspace_size_in_bytes = 0;
            backward_filters_workspace_size_in_bytes = 0;

            forward_workspace.reset();
            backward_data_workspace.reset();
            backward_filters_workspace.reset();
        }

        // Given an array of cuDNN algorithm performance results, like
        // cudnnConvolutionFwdAlgoPerf_t, pick the best one to use.
        template <typename T>
        decltype(std::declval<T>().algo) pick_best_algorithm(const std::vector<T>& perf_results)
        {
            DLIB_CASSERT(!perf_results.empty());
            CHECK_CUDNN(perf_results[0].status);
            if (dnn_prefer_fastest_algorithms())
                return perf_results[0].algo;

            // Otherwise we find the algorithm that has a good status and uses the least
            // amount of memory.
            size_t best_memory = std::numeric_limits<size_t>::max();
            // Default to the fastest algorithm.  The loop below always finds at least one
            // successful entry since we already checked perf_results[0].status above.
            decltype(std::declval<T>().algo) best_alg = perf_results[0].algo;
            for (auto&& perf : perf_results)
            {
                if (perf.status == CUDNN_STATUS_SUCCESS && perf.memory < best_memory)
                {
                    best_memory = perf.memory;
                    best_alg = perf.algo;
                }
            }
            return best_alg;
        }

        void tensor_conv::
        select_best_algorithms (
            const tensor& data,
            const tensor_descriptor& dest_desc
        )
        {
            // Calling the cuDNN "find the best algorithm" functions is really slow.  So we
            // keep a cache that tells us what method was best for a particular
            // configuration.
            thread_local std::map<std::tuple<int,int,int,int,long,long>,
                                  std::tuple<int,int,int>> config_to_algo_cache;

            // If we have already found good algorithms for this setting then just pull them
            // from the cache.
            const auto cache_key = std::make_tuple(stride_y, stride_x, padding_y, padding_x, filters_nr, filters_nc);
            const auto iter = config_to_algo_cache.find(cache_key);
            if (iter != config_to_algo_cache.end())
            {
                std::tie(forward_algo, backward_data_algo, backward_filters_algo) = iter->second;
                return;
            }


            // Pick which forward algorithm we will use and allocate the necessary
            // workspace buffer.
            cudnnConvolutionFwdAlgo_t forward_best_algo;
#if CUDNN_MAJOR >= 8
            {
                int num_possible_algorithms = 0;
                CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithmMaxCount(context(), &num_possible_algorithms));
                std::vector<cudnnConvolutionFwdAlgoPerf_t> perf_results(num_possible_algorithms);
                int num_algorithms = 0;
                CHECK_CUDNN(cudnnFindConvolutionForwardAlgorithm(
                        context(),
                        descriptor(data),
                        (const cudnnFilterDescriptor_t)filter_handle,
                        (const cudnnConvolutionDescriptor_t)conv_handle,
                        descriptor(dest_desc),
                        num_possible_algorithms,
                        &num_algorithms,
                        perf_results.data()));
                perf_results.resize(num_algorithms);
                forward_best_algo = pick_best_algorithm(perf_results);
            }
#else
            CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithm(
                    context(),
                    descriptor(data),
                    (const cudnnFilterDescriptor_t)filter_handle,
                    (const cudnnConvolutionDescriptor_t)conv_handle,
                    descriptor(dest_desc),
                    dnn_prefer_fastest_algorithms()?CUDNN_CONVOLUTION_FWD_PREFER_FASTEST:CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
                    std::numeric_limits<size_t>::max(),
                    &forward_best_algo));
#endif
            forward_algo = forward_best_algo;


            // Pick which backward data algorithm we will use and allocate the
            // necessary workspace buffer.
            cudnnConvolutionBwdDataAlgo_t backward_data_best_algo;
#if CUDNN_MAJOR >= 8
            {
                int num_possible_algorithms = 0;
                // Note: this must be the backward *data* max count, not the backward
                // filter one.
                CHECK_CUDNN(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(context(), &num_possible_algorithms));
                std::vector<cudnnConvolutionBwdDataAlgoPerf_t> perf_results(num_possible_algorithms);
                int num_algorithms = 0;
                CHECK_CUDNN(cudnnFindConvolutionBackwardDataAlgorithm(
                        context(),
                        (const cudnnFilterDescriptor_t)filter_handle,
                        descriptor(dest_desc),
                        (const cudnnConvolutionDescriptor_t)conv_handle,
                        descriptor(data),
                        num_possible_algorithms,
                        &num_algorithms,
                        perf_results.data()));
                perf_results.resize(num_algorithms);
                backward_data_best_algo = pick_best_algorithm(perf_results);
            }
#else
            CHECK_CUDNN(cudnnGetConvolutionBackwardDataAlgorithm(
                    context(),
                    (const cudnnFilterDescriptor_t)filter_handle,
                    descriptor(dest_desc),
                    (const cudnnConvolutionDescriptor_t)conv_handle,
                    descriptor(data),
                    dnn_prefer_fastest_algorithms()?CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST:CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE,
                    std::numeric_limits<size_t>::max(),
                    &backward_data_best_algo));
#endif
            backward_data_algo = backward_data_best_algo;


            // Pick which backward filters algorithm we will use and allocate the
            // necessary workspace buffer.
            cudnnConvolutionBwdFilterAlgo_t backward_filters_best_algo;
#if CUDNN_MAJOR >= 8
            {
                int num_possible_algorithms = 0;
                CHECK_CUDNN(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(context(), &num_possible_algorithms));
                std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> perf_results(num_possible_algorithms);
                int num_algorithms = 0;
                CHECK_CUDNN(cudnnFindConvolutionBackwardFilterAlgorithm(
                        context(),
                        descriptor(data),
                        descriptor(dest_desc),
                        (const cudnnConvolutionDescriptor_t)conv_handle,
                        (const cudnnFilterDescriptor_t)filter_handle,
                        num_possible_algorithms,
                        &num_algorithms,
                        perf_results.data()));
                perf_results.resize(num_algorithms);
                backward_filters_best_algo = pick_best_algorithm(perf_results);
            }
#else
            CHECK_CUDNN(cudnnGetConvolutionBackwardFilterAlgorithm(
                    context(),
                    descriptor(data),
                    descriptor(dest_desc),
                    (const cudnnConvolutionDescriptor_t)conv_handle,
                    (const cudnnFilterDescriptor_t)filter_handle,
                    dnn_prefer_fastest_algorithms()?CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST:CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE,
                    std::numeric_limits<size_t>::max(),
                    &backward_filters_best_algo));
#endif

            // cuDNN 5.1 has a bug that causes
            // cudnnGetConvolutionBackwardFilterAlgorithm() to pick the winograd
            // algorithm even for cases where cuDNN doesn't support it, leading to
            // incorrect outputs.  So here we check if we are in a case where winograd
            // isn't supported and manually overrule
            // cudnnGetConvolutionBackwardFilterAlgorithm() by picking a safe
            // algorithm.
            if (dnn_prefer_fastest_algorithms() &&
                !(stride_x == 1 && stride_y == 1 && ((filters_nr==3&&filters_nc==3) || (filters_nr==5&&filters_nc==5)))
            )
            {
                backward_filters_best_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
            }
            backward_filters_algo = backward_filters_best_algo;


            // Save this algorithm selection in the cache
            config_to_algo_cache[cache_key] = std::make_tuple(forward_algo, backward_data_algo, backward_filters_algo);
        }

        void tensor_conv::
        setup(
            const tensor& data,
            const tensor& filters,
            int stride_y_,
            int stride_x_,
            int padding_y_,
            int padding_x_
        )
        {
            DLIB_CASSERT(data.k() == filters.k());

            // if the last call to setup gave the same exact settings then don't do
            // anything.
            if (data_num_samples == data.num_samples() &&
                data_k == data.k() &&
                data_nr == data.nr() &&
                data_nc == data.nc() &&
                stride_y_ == stride_y &&
                stride_x_ == stride_x &&
                padding_y_ == padding_y &&
                padding_x_ == padding_x &&
                filters_num_samples == filters.num_samples() &&
                filters_k == filters.k() &&
                filters_nr == filters.nr() &&
                filters_nc == filters.nc()
            )
            {
                return;
            }

            clear();
            try
            {
                stride_y = stride_y_;
                stride_x = stride_x_;
                padding_y = padding_y_;
                padding_x = padding_x_;
                data_num_samples = data.num_samples();
                data_k = data.k();
                data_nr = data.nr();
                data_nc = data.nc();
                filters_num_samples = filters.num_samples();
                filters_k = filters.k();
                filters_nr = filters.nr();
                filters_nc = filters.nc();

                CHECK_CUDNN(cudnnCreateFilterDescriptor((cudnnFilterDescriptor_t*)&filter_handle));
                CHECK_CUDNN(cudnnSetFilter4dDescriptor((cudnnFilterDescriptor_t)filter_handle,
                    CUDNN_DATA_FLOAT,
                    CUDNN_TENSOR_NCHW,
                    filters.num_samples(),
                    filters.k(),
                    filters.nr(),
                    filters.nc()));

                CHECK_CUDNN(cudnnCreateConvolutionDescriptor((cudnnConvolutionDescriptor_t*)&conv_handle));
#if CUDNN_MAJOR >= 6
                CHECK_CUDNN(cudnnSetConvolution2dDescriptor((cudnnConvolutionDescriptor_t)conv_handle,
                    padding_y, // vertical padding
                    padding_x, // horizontal padding
                    stride_y,
                    stride_x,
                    1, 1, // must be 1,1
                    CUDNN_CROSS_CORRELATION, // could also be CUDNN_CONVOLUTION
                    CUDNN_DATA_FLOAT));
#else
                CHECK_CUDNN(cudnnSetConvolution2dDescriptor((cudnnConvolutionDescriptor_t)conv_handle,
                    padding_y, // vertical padding
                    padding_x, // horizontal padding
                    stride_y,
                    stride_x,
                    1, 1, // must be 1,1
                    CUDNN_CROSS_CORRELATION)); // could also be CUDNN_CONVOLUTION
#endif

                CHECK_CUDNN(cudnnGetConvolution2dForwardOutputDim(
                        (const cudnnConvolutionDescriptor_t)conv_handle,
                        descriptor(data),
                        (const cudnnFilterDescriptor_t)filter_handle,
                        &out_num_samples,
                        &out_k,
                        &out_nr,
                        &out_nc));

                tensor_descriptor dest_desc;
                dest_desc.set_size(out_num_samples,out_k,out_nr,out_nc);

                select_best_algorithms(data, dest_desc);

                CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize(
                        context(),
                        descriptor(data),
                        (const cudnnFilterDescriptor_t)filter_handle,
                        (const cudnnConvolutionDescriptor_t)conv_handle,
                        descriptor(dest_desc),
                        (cudnnConvolutionFwdAlgo_t)forward_algo,
                        &forward_workspace_size_in_bytes));


                CHECK_CUDNN(cudnnGetConvolutionBackwardDataWorkspaceSize(
                        context(),
                        (const cudnnFilterDescriptor_t)filter_handle,
                        descriptor(dest_desc),
                        (const cudnnConvolutionDescriptor_t)conv_handle,
                        descriptor(data),
                        (cudnnConvolutionBwdDataAlgo_t)backward_data_algo,
                        &backward_data_workspace_size_in_bytes));


                CHECK_CUDNN(cudnnGetConvolutionBackwardFilterWorkspaceSize(
                        context(),
                        descriptor(data),
                        descriptor(dest_desc),
                        (const cudnnConvolutionDescriptor_t)conv_handle,
                        (const cudnnFilterDescriptor_t)filter_handle,
                        (cudnnConvolutionBwdFilterAlgo_t)backward_filters_algo,
                        &backward_filters_workspace_size_in_bytes));
            }
            catch(...)
            {
                clear();
                throw;
            }
        }

        tensor_conv::
        ~tensor_conv (
        )
        {
            clear();
        }

        void tensor_conv::operator() (
            const bool add_to_output,
            resizable_tensor& output,
            const tensor& data,
            const tensor& filters
        )
        {
            DLIB_CASSERT(stride_y > 0 && stride_x > 0, "You must call setup() before calling this function");

            output.set_size(out_num_samples, out_k, out_nr, out_nc);
            (*this)(add_to_output, static_cast<tensor&>(output), data, filters);
        }

        void tensor_conv::operator() (
            const bool add_to_output,
            tensor& output,
            const tensor& data,
            const tensor& filters
        )
        {
            DLIB_CASSERT(is_same_object(output,data) == false);
            DLIB_CASSERT(is_same_object(output,filters) == false);
            DLIB_CASSERT(filters.k() == data.k());
            DLIB_CASSERT(stride_y > 0 && stride_x > 0, "You must call setup() before calling this function");
            DLIB_CASSERT(filters.nc() <= data.nc() + 2*padding_x,
                "Filter windows must be small enough to fit into the padded image."
                << "\n\t filters.nc(): " << filters.nc()
                << "\n\t data.nc(): " << data.nc()
                << "\n\t padding_x: " << padding_x
            );
            DLIB_CASSERT(filters.nr() <= data.nr() + 2*padding_y,
                "Filter windows must be small enough to fit into the padded image."
                << "\n\t filters.nr(): " << filters.nr()
                << "\n\t data.nr(): " << data.nr()
                << "\n\t padding_y: " << padding_y
            );


            DLIB_CASSERT(output.num_samples() == data.num_samples(), out_num_samples << " " << data.num_samples());
            DLIB_CASSERT(output.k() == filters.num_samples());
            DLIB_CASSERT(output.nr() == 1+(data.nr()+2*padding_y-filters.nr())/stride_y);
            DLIB_CASSERT(output.nc() == 1+(data.nc()+2*padding_x-filters.nc())/stride_x);


            const float alpha = 1;
            const float beta = add_to_output ? 1 : 0;

            // Since cudnnConvolutionForward() is an asynchronous call, we need to hold a
            // reference to the workspace buffer so we can be sure it isn't reallocated
            // while the function is still executing on the device.  But each time we come
            // here, we make sure to grab the latest workspace buffer so that, globally, we
            // minimize the number of such buffers.
            forward_workspace = device_global_buffer(forward_workspace_size_in_bytes);

            CHECK_CUDNN(cudnnConvolutionForward(
                context(),
                &alpha,
                descriptor(data),
                data.device(),
                (const cudnnFilterDescriptor_t)filter_handle,
                filters.device(),
                (const cudnnConvolutionDescriptor_t)conv_handle,
                (cudnnConvolutionFwdAlgo_t)forward_algo,
                forward_workspace,
                forward_workspace_size_in_bytes,
                &beta,
                descriptor(output),
                output.device()));
        }

        void tensor_conv::get_gradient_for_data (
            const bool add_to_output,
            const tensor& gradient_input,
            const tensor& filters,
            tensor& data_gradient
        )
        {
            const float alpha = 1;
            const float beta = add_to_output ? 1 : 0;

            // Since cudnnConvolutionBackwardData() is an asynchronous call, we need to hold a
            // reference to the workspace buffer so we can be sure it isn't reallocated
            // while the function is still executing on the device.  But each time we come
            // here, we make sure to grab the latest workspace buffer so that, globally, we
            // minimize the number of such buffers.
            backward_data_workspace = device_global_buffer(backward_data_workspace_size_in_bytes);


            CHECK_CUDNN(cudnnConvolutionBackwardData(context(),
                &alpha,
                (const cudnnFilterDescriptor_t)filter_handle,
                filters.device(),
                descriptor(gradient_input),
                gradient_input.device(),
                (const cudnnConvolutionDescriptor_t)conv_handle,
                (cudnnConvolutionBwdDataAlgo_t)backward_data_algo,
                backward_data_workspace,
                backward_data_workspace_size_in_bytes,
                &beta,
                descriptor(data_gradient),
                data_gradient.device()));
        }

        void tensor_conv::
        get_gradient_for_filters (
            const bool add_to_output,
            const tensor& gradient_input,
            const tensor& data,
            tensor& filters_gradient
        )
        {
            const float alpha = 1;
            const float beta = add_to_output ? 1 : 0;

            // Since cudnnConvolutionBackwardFilter() is an asynchronous call, we need to hold a
            // reference to the workspace buffer so we can be sure it isn't reallocated
            // while the function is still executing on the device.  But each time we come
            // here, we make sure to grab the latest workspace buffer so that, globally, we
            // minimize the number of such buffers.
            backward_filters_workspace = device_global_buffer(backward_filters_workspace_size_in_bytes);

            CHECK_CUDNN(cudnnConvolutionBackwardFilter(context(),
                &alpha,
                descriptor(data),
                data.device(),
                descriptor(gradient_input),
                gradient_input.device(),
                (const cudnnConvolutionDescriptor_t)conv_handle,
                (cudnnConvolutionBwdFilterAlgo_t)backward_filters_algo,
                backward_filters_workspace,
                backward_filters_workspace_size_in_bytes,
                &beta,
                (const cudnnFilterDescriptor_t)filter_handle,
                filters_gradient.device()));
        }

    // ------------------------------------------------------------------------------------
    // ------------------------------------------------------------------------------------

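        // The pooling object wraps a cudnnPoolingDescriptor_t.  Call setup_max_pooling()
        // or setup_avg_pooling() to configure it, then operator() to pool and
        // get_gradient() to backpropagate through the pooling layer.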
        pooling::pooling (
        ) : handle(nullptr),window_height(0),window_width(0),stride_y(0),stride_x(0),padding_y(0),padding_x(0)
        {
        }

        pooling::~pooling(
        )
        {
            clear();
        }

        void pooling::
        clear(
        )
        {
            if (handle)
                cudnnDestroyPoolingDescriptor((cudnnPoolingDescriptor_t)handle);
            handle = nullptr;
            window_height = 0;
            window_width = 0;
            stride_y = 0;
            stride_x = 0;
            padding_y = 0;
            padding_x = 0;
        }

        void pooling::
        setup_max_pooling(
            int window_height_,
            int window_width_,
            int stride_y_,
            int stride_x_,
            int padding_y_,
            int padding_x_
        )
        {
            setup(window_height_, window_width_, stride_y_, stride_x_, padding_y_, padding_x_, CUDNN_POOLING_MAX);
            do_max_pooling = true;
        }

        void pooling::
        setup_avg_pooling(
            int window_height_,
            int window_width_,
            int stride_y_,
            int stride_x_,
            int padding_y_,
            int padding_x_
        )
        {
            setup(window_height_, window_width_, stride_y_, stride_x_, padding_y_, padding_x_, CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING);
            do_max_pooling = false;
        }

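        // Note the argument order cudnnSetPooling2dDescriptor() expects: window
        // height/width first, then vertical/horizontal padding, then vertical/horizontal
        // stride.  The asserts below also require the padding to be smaller than the window.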
        void pooling::
        setup(
            int window_height_,
            int window_width_,
            int stride_y_,
            int stride_x_,
            int padding_y_,
            int padding_x_,
            int pooling_mode
        )
        {
            DLIB_CASSERT (window_height_ > 0 && window_width_ > 0 &&
                          stride_y_ > 0 && stride_x_ > 0 ,
                          "window_height_: " << window_height_
                          << "\n\t window_width_: " << window_width_
                          << "\n\t stride_y_: " << stride_y_
                          << "\n\t stride_x_: " << stride_x_ );
            DLIB_CASSERT( 0 <= padding_y_ && padding_y_ < window_height_ &&
                          0 <= padding_x_ && padding_x_ < window_width_,
                          "window_height_: " << window_height_
                          << "\n\t window_width_: " << window_width_
                          << "\n\t padding_y_: " << padding_y_
                          << "\n\t padding_x_: " << padding_x_ );

            if (window_height == window_height_ &&
                window_width == window_width_ &&
                stride_y == stride_y_ &&
                stride_x == stride_x_ &&
                padding_y == padding_y_ &&
                padding_x == padding_x_
            )
            {
                return;
            }

            clear();
            try
            {
                window_height = window_height_;
                window_width = window_width_;
                stride_x = stride_x_;
                stride_y = stride_y_;
                padding_y = padding_y_;
                padding_x = padding_x_;
                cudnnPoolingDescriptor_t poolingDesc;
                CHECK_CUDNN(cudnnCreatePoolingDescriptor(&poolingDesc));
                handle = poolingDesc;

                CHECK_CUDNN(cudnnSetPooling2dDescriptor(poolingDesc,
                    (cudnnPoolingMode_t)pooling_mode,
                    CUDNN_PROPAGATE_NAN,
                    window_height,
                    window_width,
                    padding_y,
                    padding_x,
                    stride_y,
                    stride_x));
            }
            catch(...)
            {
                clear();
                throw;
            }
        }

        void pooling::
        operator() (
            resizable_tensor& dest,
            const tensor& src
        )
        {
            DLIB_CASSERT(window_width <= src.nc() + 2*padding_x,
                "Pooling windows must be small enough to fit into the padded image."
                << "\n\t window_width: " << window_width
                << "\n\t src.nc(): " << src.nc()
                << "\n\t padding_x: " << padding_x
            );
            DLIB_CASSERT(window_height <= src.nr() + 2*padding_y,
                "Pooling windows must be small enough to fit into the padded image."
                << "\n\t window_height: " << window_height
                << "\n\t src.nr(): " << src.nr()
                << "\n\t padding_y: " << padding_y
            );
            const float alpha = 1;
            const float beta = 0;
            int outN;
            int outC;
            int outH;
            int outW;
            CHECK_CUDNN(cudnnGetPooling2dForwardOutputDim((const cudnnPoolingDescriptor_t)handle,
                descriptor(src),
                &outN,
                &outC,
                &outH,
                &outW));


            dest.set_size(outN,outC,outH,outW);

            DLIB_CASSERT(dest.num_samples() == src.num_samples());
            DLIB_CASSERT(dest.k() == src.k());
            DLIB_CASSERT(dest.nr() == 1 + (src.nr() + 2*padding_y - window_height)/stride_y,
                "\n stride_y: " << stride_y <<
                "\n padding_y: " << padding_y <<
                "\n window_height: " << window_height <<
                "\n src.nr(): " << src.nr() <<
                "\n dest.nr(): " << dest.nr() <<
                "\n src.nr()/stride_y: " << src.nr()/stride_y);
            DLIB_CASSERT(dest.nc() == 1 + (src.nc() + 2*padding_x - window_width)/stride_x,
                "\n stride_x: " << stride_x <<
                "\n padding_x: " << padding_x <<
                "\n window_width: " << window_width <<
                "\n src.nc(): " << src.nc() <<
                "\n dest.nc(): " << dest.nc() <<
                "\n src.nc()/stride_x: " << src.nc()/stride_x);

            CHECK_CUDNN(cudnnPoolingForward(context(),
                (const cudnnPoolingDescriptor_t)handle,
                &alpha,
                descriptor(src),
                src.device(),
                &beta,
                descriptor(dest),
                dest.device()));
        }

        void pooling::get_gradient(
            const tensor& gradient_input,
            const tensor& dest,
            const tensor& src,
            tensor& grad
        )
        {
            DLIB_CASSERT(have_same_dimensions(gradient_input,dest));
            DLIB_CASSERT(have_same_dimensions(src,grad));

            const float alpha = 1;
            const float beta = 1;
            CHECK_CUDNN(cudnnPoolingBackward(context(),
                (const cudnnPoolingDescriptor_t)handle,
                &alpha,
                descriptor(dest),
                dest.device(),
                descriptor(gradient_input),
                gradient_input.device(),
                descriptor(src),
                src.device(),
                &beta,
                descriptor(grad),
                grad.device()));
        }

    // ------------------------------------------------------------------------------------
    // ------------------------------------------------------------------------------------

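        // CUDNN_SOFTMAX_MODE_CHANNEL: the softmax is computed over the k channels
        // independently for each sample and each (r,c) location.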
        void softmax (
            tensor& dest,
            const tensor& src
        )
        {
            DLIB_CASSERT(have_same_dimensions(dest,src));
            if (src.size() == 0)
                return;

            const float alpha = 1;
            const float beta = 0;

            CHECK_CUDNN(cudnnSoftmaxForward(context(),
                CUDNN_SOFTMAX_ACCURATE,
                CUDNN_SOFTMAX_MODE_CHANNEL,
                &alpha,
                descriptor(src),
                src.device(),
                &beta,
                descriptor(dest),
                dest.device()));
        }


        void softmax_gradient (
            tensor& grad,
            const tensor& dest,
            const tensor& gradient_input
        )
        {
            DLIB_CASSERT(
                have_same_dimensions(dest,gradient_input) == true &&
                have_same_dimensions(dest,grad) == true );
            if (dest.size() == 0)
                return;

            const float alpha = 1;
            const float beta = is_same_object(grad,gradient_input) ? 0 : 1;
            CHECK_CUDNN(cudnnSoftmaxBackward(context(),
                CUDNN_SOFTMAX_ACCURATE,
                CUDNN_SOFTMAX_MODE_CHANNEL,
                &alpha,
                descriptor(dest),
                dest.device(),
                descriptor(gradient_input),
                gradient_input.device(),
                &beta,
                descriptor(grad),
                grad.device()));
        }

    // ------------------------------------------------------------------------------------
    // ------------------------------------------------------------------------------------

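        // CUDNN_SOFTMAX_MODE_INSTANCE: unlike softmax() above, the softmax here is
        // computed over all k*nr*nc elements of each sample.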
        void softmax_all (
            tensor& dest,
            const tensor& src
        )
        {
            DLIB_CASSERT(have_same_dimensions(dest,src));
            if (src.size() == 0)
                return;

            const float alpha = 1;
            const float beta = 0;

            CHECK_CUDNN(cudnnSoftmaxForward(context(),
                CUDNN_SOFTMAX_ACCURATE,
                CUDNN_SOFTMAX_MODE_INSTANCE,
                &alpha,
                descriptor(src),
                src.device(),
                &beta,
                descriptor(dest),
                dest.device()));
        }


        void softmax_all_gradient (
            tensor& grad,
            const tensor& dest,
            const tensor& gradient_input
        )
        {
            DLIB_CASSERT(
                have_same_dimensions(dest,gradient_input) == true &&
                have_same_dimensions(dest,grad) == true );
            if (dest.size() == 0)
                return;

            const float alpha = 1;
            const float beta = is_same_object(grad,gradient_input) ? 0 : 1;
            CHECK_CUDNN(cudnnSoftmaxBackward(context(),
                CUDNN_SOFTMAX_ACCURATE,
                CUDNN_SOFTMAX_MODE_INSTANCE,
                &alpha,
                descriptor(dest),
                dest.device(),
                descriptor(gradient_input),
                gradient_input.device(),
                &beta,
                descriptor(grad),
                grad.device()));
        }

    // ------------------------------------------------------------------------------------
    // ------------------------------------------------------------------------------------

        void sigmoid (
            tensor& dest,
            const tensor& src
        )
        {
            DLIB_CASSERT(have_same_dimensions(dest,src));
            if (src.size() == 0)
                return;

            const float alpha = 1;
            const float beta = 0;
            CHECK_CUDNN(cudnnActivationForward(context(),
                sigmoid_activation_descriptor(),
                &alpha,
                descriptor(src),
                src.device(),
                &beta,
                descriptor(dest),
                dest.device()));
        }

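        // In this and the other *_gradient() functions below, beta selects between
        // overwriting and accumulating: when grad and gradient_input are the same tensor
        // we overwrite it in place (beta == 0), otherwise we add into grad (beta == 1).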
        void sigmoid_gradient (
            tensor& grad,
            const tensor& dest,
            const tensor& gradient_input
        )
        {
            DLIB_CASSERT(
                have_same_dimensions(dest,gradient_input) == true &&
                have_same_dimensions(dest,grad) == true );
            if (dest.size() == 0)
                return;

            const float alpha = 1;
            const float beta = is_same_object(grad,gradient_input) ? 0 : 1;
            CHECK_CUDNN(cudnnActivationBackward(context(),
                sigmoid_activation_descriptor(),
                &alpha,
                descriptor(dest),
                dest.device(),
                descriptor(gradient_input),
                gradient_input.device(),
                descriptor(dest),
                dest.device(),
                &beta,
                descriptor(grad),
                grad.device()));
        }

    // ------------------------------------------------------------------------------------

        void relu (
            tensor& dest,
            const tensor& src
        )
        {
            DLIB_CASSERT(have_same_dimensions(dest,src));
            if (src.size() == 0)
                return;

            const float alpha = 1;
            const float beta = 0;
            CHECK_CUDNN(cudnnActivationForward(context(),
                relu_activation_descriptor(),
                &alpha,
                descriptor(src),
                src.device(),
                &beta,
                descriptor(dest),
                dest.device()));
        }

        void relu_gradient (
            tensor& grad,
            const tensor& dest,
            const tensor& gradient_input
        )
        {
            DLIB_CASSERT(
                have_same_dimensions(dest,gradient_input) == true &&
                have_same_dimensions(dest,grad) == true );
            if (dest.size() == 0)
                return;

            const float alpha = 1;
            const float beta = is_same_object(grad,gradient_input) ? 0 : 1;
            CHECK_CUDNN(cudnnActivationBackward(context(),
                relu_activation_descriptor(),
                &alpha,
                descriptor(dest),
                dest.device(),
                descriptor(gradient_input),
                gradient_input.device(),
                descriptor(dest),
                dest.device(),
                &beta,
                descriptor(grad),
                grad.device()));
        }

    // ------------------------------------------------------------------------------------

        void tanh (
            tensor& dest,
            const tensor& src
        )
        {
            DLIB_CASSERT(have_same_dimensions(dest,src));
            if (src.size() == 0)
                return;

            const float alpha = 1;
            const float beta = 0;
            CHECK_CUDNN(cudnnActivationForward(context(),
                tanh_activation_descriptor(),
                &alpha,
                descriptor(src),
                src.device(),
                &beta,
                descriptor(dest),
                dest.device()));
        }

        void tanh_gradient (
            tensor& grad,
            const tensor& dest,
            const tensor& gradient_input
        )
        {
            DLIB_CASSERT(
                have_same_dimensions(dest,gradient_input) == true &&
                have_same_dimensions(dest,grad) == true);
            if (dest.size() == 0)
                return;

            const float alpha = 1;
            const float beta = is_same_object(grad,gradient_input) ? 0 : 1;
            CHECK_CUDNN(cudnnActivationBackward(context(),
                tanh_activation_descriptor(),
                &alpha,
                descriptor(dest),
                dest.device(),
                descriptor(gradient_input),
                gradient_input.device(),
                descriptor(dest),
                dest.device(),
                &beta,
                descriptor(grad),
                grad.device()));
        }

    // ------------------------------------------------------------------------------------

    }
}

#endif // DLIB_USE_CUDA

#endif // DLIB_DNN_CuDNN_CPP_
