// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// Compute shader: every invocation produces one 4-lane output element at
// (x, y, z) by gathering 8-lane input elements through a strided, dilated
// kernel window, then applies an optional bias and activation.
//
// NOTE(review): the gather pattern (output position -> input position via
// division by the stride) looks like a transposed convolution with 8-lane
// packed input and 4-lane packed output -- confirm against the host-side layer.
//
// The afp*/sfp* types and the psc(), buffer_*() and image3d_*() helpers are
// not defined in this file; they are expected to be supplied by the build.

#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

// Layer parameters, fixed at pipeline creation through specialization constants.
layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
layout (constant_id = 3) const int dilation_h = 1;
layout (constant_id = 4) const int stride_w = 1;
layout (constant_id = 5) const int stride_h = 1;
layout (constant_id = 6) const int bias_term = 0;
layout (constant_id = 7) const int activation_type = 0;
layout (constant_id = 8) const float activation_param_0 = 0;
layout (constant_id = 9) const float activation_param_1 = 0;

// Input and output shapes as specialization constants; all default to 0.
#define shape_constant_id_offset 10
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;

// Resources: input, output, weights and bias, either as 3D images or as
// storage buffers depending on NCNN_image_shader.
#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler3D weight_blob;
layout (binding = 3) uniform unfp sampler3D bias_blob;
#else
layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; };
layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; };
#endif

// Same shape fields as the specialization constants above, passed per dispatch.
// NOTE(review): psc(x) is defined elsewhere; presumably it picks between the
// specialization constant `x` and the push-constant field `p.x` -- confirm.
layout (push_constant) uniform parameter
{
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    int outdims;
    int outw;
    int outh;
    int outc;
    int outcstep;
} p;

void main()
{
    int out_x = int(gl_GlobalInvocationID.x);
    int out_y = int(gl_GlobalInvocationID.y);
    int out_z = int(gl_GlobalInvocationID.z);

    // Invocations that fall outside the output volume write nothing.
    if (out_x >= psc(outw) || out_y >= psc(outh) || out_z >= psc(outc))
        return;

    // Accumulator starts at zero and is replaced by the bias when one is present.
    afpvec4 acc = afpvec4(0.f);

    if (bias_term == 1)
    {
#if NCNN_image_shader
        acc = image3d_ld4(bias_blob, ivec3(out_z, 0, 0));
#else
        acc = buffer_ld4(bias_data, out_z);
#endif
    }

    // Dilated kernel extent minus one along each axis.
    const int reach_w = dilation_w * (kernel_w - 1);
    const int reach_h = dilation_h * (kernel_h - 1);

#if NCNN_image_shader
    for (int ky = 0; ky < kernel_h; ky++)
    {
        // Row position before dividing by the stride; a tap contributes only
        // when this is non-negative and an exact multiple of the stride.
        int iy_strided = out_y + ky * dilation_h - reach_h;
        bool row_hit = iy_strided >= 0 && iy_strided % stride_h == 0;
        if (!row_hit)
            continue;

        int iy = iy_strided / stride_h;
        if (iy >= psc(h))
            continue;

        for (int kx = 0; kx < kernel_w; kx++)
        {
            int ix_strided = out_x + kx * dilation_w - reach_w;
            bool col_hit = ix_strided >= 0 && ix_strided % stride_w == 0;
            if (!col_hit)
                continue;

            int ix = ix_strided / stride_w;
            if (ix >= psc(w))
                continue;

            // Each kernel tap occupies four consecutive texels along x in the
            // weight image, one per output lane.
            int tap_x = (ky * kernel_w + kx) * 4;

            for (int ic = 0; ic < psc(c); ic++)
            {
                afpvec8 v = image3d_ld8(bottom_blob, ivec3(ix, iy, ic));

                afpvec8 k0 = image3d_ld8(weight_blob, ivec3(tap_x + 0, ic, out_z));
                afpvec8 k1 = image3d_ld8(weight_blob, ivec3(tap_x + 1, ic, out_z));
                afpvec8 k2 = image3d_ld8(weight_blob, ivec3(tap_x + 2, ic, out_z));
                afpvec8 k3 = image3d_ld8(weight_blob, ivec3(tap_x + 3, ic, out_z));

                // acc += v * k, one 8-wide dot product per output lane
                acc += afpvec4(
                    dot(v[0], k0[0]) + dot(v[1], k0[1]),
                    dot(v[0], k1[0]) + dot(v[1], k1[1]),
                    dot(v[0], k2[0]) + dot(v[1], k2[1]),
                    dot(v[0], k3[0]) + dot(v[1], k3[1]));
            }
        }
    }
#else
    // Number of weight groups per input channel, and the first weight group
    // belonging to this output element.
    const int taps_per_channel = kernel_w * kernel_h;
    int weight_base = out_z * psc(c) * kernel_w * kernel_h;

    for (int ky = 0; ky < kernel_h; ky++)
    {
        // Row position before dividing by the stride; a tap contributes only
        // when this is non-negative and an exact multiple of the stride.
        int iy_strided = out_y + ky * dilation_h - reach_h;
        bool row_hit = iy_strided >= 0 && iy_strided % stride_h == 0;
        if (!row_hit)
            continue;

        int iy = iy_strided / stride_h;
        if (iy >= psc(h))
            continue;

        for (int kx = 0; kx < kernel_w; kx++)
        {
            int ix_strided = out_x + kx * dilation_w - reach_w;
            bool col_hit = ix_strided >= 0 && ix_strided % stride_w == 0;
            if (!col_hit)
                continue;

            int ix = ix_strided / stride_w;
            if (ix >= psc(w))
                continue;

            // Offsets for input channel 0; later channels step by cstep
            // (input) and by taps_per_channel (weights).
            int in_base = iy * psc(w) + ix;
            int tap_base = weight_base + ky * kernel_w + kx;

            for (int ic = 0; ic < psc(c); ic++)
            {
                int v_idx = in_base + ic * psc(cstep);
                int k_idx = (tap_base + ic * taps_per_channel) * 4;

                afpvec8 v = buffer_ld8(bottom_blob_data, v_idx);

                afpvec8 k0 = buffer_ld8(weight_data, k_idx + 0);
                afpvec8 k1 = buffer_ld8(weight_data, k_idx + 1);
                afpvec8 k2 = buffer_ld8(weight_data, k_idx + 2);
                afpvec8 k3 = buffer_ld8(weight_data, k_idx + 3);

                // acc += v * k, one 8-wide dot product per output lane
                acc += afpvec4(
                    dot(v[0], k0[0]) + dot(v[1], k0[1]),
                    dot(v[0], k1[0]) + dot(v[1], k1[1]),
                    dot(v[0], k2[0]) + dot(v[1], k2[1]),
                    dot(v[0], k3[0]) + dot(v[1], k3[1]));
            }
        }
    }
#endif

    // Optional activation, selected by the activation_type constant.
    if (activation_type == 1)
    {
        // clamp negatives to zero
        acc = max(acc, afp(0.f));
    }
    else if (activation_type == 2)
    {
        // scale negative lanes by activation_param_0
        const afp slope = afp(activation_param_0);
        acc = mix(acc, acc * afp(slope), lessThan(acc, afpvec4(0.f)));
    }
    else if (activation_type == 3)
    {
        // clamp to [activation_param_0, activation_param_1]
        const afp const_min = afp(activation_param_0);
        const afp const_max = afp(activation_param_1);
        acc = clamp(acc, const_min, const_max);
    }
    else if (activation_type == 4)
    {
        // logistic sigmoid
        acc = afp(1.f) / (afp(1.f) + exp(-acc));
    }

#if NCNN_image_shader
    image3d_st4(top_blob, ivec3(out_x, out_y, out_z), acc);
#else
    const int out_idx = out_z * psc(outcstep) + out_y * psc(outw) + out_x;

    buffer_st4(top_blob_data, out_idx, acc);
#endif
}