// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
// storage layout of one 8-wide element: two 4-wide half-float vectors
struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

// dimension count used for the output-side index math in main()
layout (constant_id = 0) const int ndim = 0;

// Shape specialization constants. Every name here also exists as a member of
// the push-constant block below, and all shape reads in main() go through psc().
// NOTE(review): psc() is defined outside this file; presumably it picks the
// push-constant member when the specialization constant is 0 - confirm.
#define shape_constant_id_offset 1
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int d = 0;
layout (constant_id = shape_constant_id_offset + 4) const int c = 0;
layout (constant_id = shape_constant_id_offset + 5) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 6) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outd = 0;
layout (constant_id = shape_constant_id_offset + 10) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 11) const int outcstep = 0;

// binding 0 is the source blob, binding 1 the destination blob
#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob_3d;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d;
#else
#if NCNN_fp16_packed
layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; };
#else
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
#endif
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
#endif

// shape values supplied at dispatch time (same names as the constants above)
layout (push_constant) uniform parameter
{
    int dims;
    int w;
    int h;
    int d;
    int c;
    int cstep;

    int outdims;
    int outw;
    int outh;
    int outd;
    int outc;
    int outcstep;
} p;

// Each invocation produces one 8-wide output element at (gx, gy, gz).
// It derives eight flat scalar indices from the output position, maps every
// index back to a source coordinate according to psc(dims), fetches the eight
// scalars (the source is addressed in groups of 4 along its outermost axis)
// and stores them as a single 8-wide value.
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // gy covers outh * outd rows
    bool out_of_range = gx >= psc(outw) || gy >= psc(outh) * psc(outd) || gz >= psc(outc);
    if (out_of_range)
        return;

    ivec4 step4 = ivec4(0, 1, 2, 3);

    // flat scalar indices of output lanes 0..3 (flat_lo) and 4..7 (flat_hi);
    // consecutive lanes are one "stride" apart along the 8-wide axis
    ivec4 flat_lo;
    ivec4 flat_hi;
    if (ndim == 1)
    {
        flat_lo = gx * 8 + step4;
        flat_hi = flat_lo + 4;
    }
    else if (ndim == 2)
    {
        int stride = psc(outw);

        flat_lo = (gy * 8) * stride + gx + step4 * stride;
        flat_hi = flat_lo + 4 * stride;
    }
    else if (ndim == 3)
    {
        int stride = psc(outh) * psc(outw);

        flat_lo = (gz * 8) * stride + gy * psc(outw) + gx + step4 * stride;
        flat_hi = flat_lo + 4 * stride;
    }
    else if (ndim == 4)
    {
        int stride = psc(outd) * psc(outh) * psc(outw);

        flat_lo = (gz * 8) * stride + gy * psc(outw) + gx + step4 * stride;
        flat_hi = flat_lo + 4 * stride;
    }

    // source coordinates for the low / high four lanes
    ivec4 sx_lo;
    ivec4 sx_hi;
    ivec4 sy_lo;
    ivec4 sy_hi;
    ivec4 sz_lo;
    ivec4 sz_hi;

#if NCNN_image_shader
    // component inside the fetched 4-wide texel
    ivec4 sub_lo;
    ivec4 sub_hi;
#else
    // element offsets into bottom_blob_data
    ivec4 off_lo;
    ivec4 off_hi;
#if NCNN_fp16_packed
    // which half of the fetched 2-wide element to use
    ivec4 half_lo;
    ivec4 half_hi;
#endif
#endif

    if (psc(dims) == 1)
    {
        sx_lo = flat_lo;
        sy_lo = ivec4(0);
        sz_lo = ivec4(0);
        sx_hi = flat_hi;
        sy_hi = ivec4(0);
        sz_hi = ivec4(0);

#if NCNN_image_shader
        sub_lo = sx_lo % 4;
        sub_hi = sx_hi % 4;
        sx_lo = sx_lo / 4;
        sx_hi = sx_hi / 4;
#else
#if NCNN_fp16_packed
        off_lo = flat_lo / 2;
        half_lo = flat_lo % 2;
        off_hi = flat_hi / 2;
        half_hi = flat_hi % 2;
#else
        off_lo = flat_lo;
        off_hi = flat_hi;
#endif
#endif
    }
    else if (psc(dims) == 2)
    {
        sx_lo = flat_lo % psc(w);
        sy_lo = flat_lo / psc(w);
        sz_lo = ivec4(0);
        sx_hi = flat_hi % psc(w);
        sy_hi = flat_hi / psc(w);
        sz_hi = ivec4(0);

#if NCNN_image_shader
        sub_lo = sy_lo % 4;
        sub_hi = sy_hi % 4;
        sy_lo = sy_lo / 4;
        sy_hi = sy_hi / 4;
#else
#if NCNN_fp16_packed
        off_lo = ((sy_lo / 4) * psc(w) + sx_lo) * 2 + (sy_lo % 4) / 2;
        half_lo = sy_lo % 2;
        off_hi = ((sy_hi / 4) * psc(w) + sx_hi) * 2 + (sy_hi % 4) / 2;
        half_hi = sy_hi % 2;
#else
        off_lo = ((sy_lo / 4) * psc(w) + sx_lo) * 4 + sy_lo % 4;
        off_hi = ((sy_hi / 4) * psc(w) + sx_hi) * 4 + sy_hi % 4;
#endif
#endif
    }
    else
    {
        if (psc(dims) == 3)
        {
            int plane = psc(w) * psc(h);

            ivec4 rem_lo = flat_lo % plane;
            ivec4 rem_hi = flat_hi % plane;

            sz_lo = flat_lo / plane;
            sy_lo = rem_lo / psc(w);
            sx_lo = rem_lo % psc(w);
            sz_hi = flat_hi / plane;
            sy_hi = rem_hi / psc(w);
            sx_hi = rem_hi % psc(w);
        }
        else
        {
            // any remaining dims value is treated as 4
            int volume = psc(w) * psc(h) * psc(d);
            int plane = psc(w) * psc(h);

            ivec4 rem_lo = flat_lo % volume;
            ivec4 rem_hi = flat_hi % volume;
            ivec4 in_plane_lo = rem_lo % plane;
            ivec4 in_plane_hi = rem_hi % plane;

            sz_lo = flat_lo / volume;
            sx_lo = in_plane_lo % psc(w);
            sz_hi = flat_hi / volume;
            sx_hi = in_plane_hi % psc(w);

            // depth and height are folded into a single row coordinate
            sy_lo = (rem_lo / plane) * psc(h) + in_plane_lo / psc(w);
            sy_hi = (rem_hi / plane) * psc(h) + in_plane_hi / psc(w);
        }

        // for 3 and 4 dims the z coordinate is the one split into groups of 4
#if NCNN_image_shader
        sub_lo = sz_lo % 4;
        sub_hi = sz_hi % 4;
        sz_lo = sz_lo / 4;
        sz_hi = sz_hi / 4;
#else
#if NCNN_fp16_packed
        off_lo = ((sz_lo / 4) * psc(cstep) + sy_lo * psc(w) + sx_lo) * 2 + (sz_lo % 4) / 2;
        half_lo = sz_lo % 2;
        off_hi = ((sz_hi / 4) * psc(cstep) + sy_hi * psc(w) + sx_hi) * 2 + (sz_hi % 4) / 2;
        half_hi = sz_hi % 2;
#else
        off_lo = ((sz_lo / 4) * psc(cstep) + sy_lo * psc(w) + sx_lo) * 4 + sz_lo % 4;
        off_hi = ((sz_hi / 4) * psc(cstep) + sy_hi * psc(w) + sx_hi) * 4 + sz_hi % 4;
#endif
#endif
    }

#if NCNN_image_shader
    // fetch the eight 4-wide texels, then keep one component from each
    afpvec4 t0 = image3d_ld4(bottom_blob_3d, ivec3(sx_lo.x, sy_lo.x, sz_lo.x));
    afpvec4 t1 = image3d_ld4(bottom_blob_3d, ivec3(sx_lo.y, sy_lo.y, sz_lo.y));
    afpvec4 t2 = image3d_ld4(bottom_blob_3d, ivec3(sx_lo.z, sy_lo.z, sz_lo.z));
    afpvec4 t3 = image3d_ld4(bottom_blob_3d, ivec3(sx_lo.w, sy_lo.w, sz_lo.w));
    afpvec4 t4 = image3d_ld4(bottom_blob_3d, ivec3(sx_hi.x, sy_hi.x, sz_hi.x));
    afpvec4 t5 = image3d_ld4(bottom_blob_3d, ivec3(sx_hi.y, sy_hi.y, sz_hi.y));
    afpvec4 t6 = image3d_ld4(bottom_blob_3d, ivec3(sx_hi.z, sy_hi.z, sz_hi.z));
    afpvec4 t7 = image3d_ld4(bottom_blob_3d, ivec3(sx_hi.w, sy_hi.w, sz_hi.w));

    afpvec8 v;
    v[0] = afpvec4(t0[sub_lo.x], t1[sub_lo.y], t2[sub_lo.z], t3[sub_lo.w]);
    v[1] = afpvec4(t4[sub_hi.x], t5[sub_hi.y], t6[sub_hi.z], t7[sub_hi.w]);

    if (ndim == 1)
    {
        image3d_st8(top_blob_3d, ivec3(gx, 0, 0), v);
    }
    else if (ndim == 2)
    {
        image3d_st8(top_blob_3d, ivec3(gx, gy, 0), v);
    }
    else if (ndim == 3 || ndim == 4)
    {
        image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v);
    }
#else
    // destination element index
    int gi;
    if (ndim == 1)
    {
        gi = gx;
    }
    else if (ndim == 2)
    {
        gi = gy * psc(outw) + gx;
    }
    else if (ndim == 3 || ndim == 4)
    {
        gi = gz * psc(outcstep) + gy * psc(outw) + gx;
    }

#if NCNN_fp16_packed
    // fetch the eight 2-wide elements, then keep one half from each
    afpvec2 a0 = buffer_ld2(bottom_blob_data, off_lo.x);
    afpvec2 a1 = buffer_ld2(bottom_blob_data, off_lo.y);
    afpvec2 a2 = buffer_ld2(bottom_blob_data, off_lo.z);
    afpvec2 a3 = buffer_ld2(bottom_blob_data, off_lo.w);

    afpvec2 b0 = buffer_ld2(bottom_blob_data, off_hi.x);
    afpvec2 b1 = buffer_ld2(bottom_blob_data, off_hi.y);
    afpvec2 b2 = buffer_ld2(bottom_blob_data, off_hi.z);
    afpvec2 b3 = buffer_ld2(bottom_blob_data, off_hi.w);

    afpvec8 v = afpvec8(
        a0[half_lo.x], a1[half_lo.y], a2[half_lo.z], a3[half_lo.w],
        b0[half_hi.x], b1[half_hi.y], b2[half_hi.z], b3[half_hi.w]);

    {
        buffer_st8(top_blob_data, gi, v);
    }
#else
    {
        buffer_cp1to8(top_blob_data, gi, bottom_blob_data, off_lo, off_hi);
    }
#endif
#endif
}