// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

// Repacks a blob from 1 element per pack (the buffer path reads scalar `sfp`)
// into 8 elements per pack (it writes `sfpvec8`), applying a crop-style offset
// (woffset/hoffset/coffset) along each axis. One invocation produces one
// pack-8 output element.
//
// NOTE(review): helper macros (sfp, sfpvec2, afpvec2/4/8, psc, unfp, imfmtc4,
// buffer_ld2/st8, buffer_cp1to8, image3d_ld4/st8) are injected by the build
// from a shared shader preamble; their definitions are not visible in this file.

#version 450

#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

// Specialization constant: non-zero enables a driver-bug workaround that
// replaces dynamic component indexing of f16 vectors with explicit
// per-component compares (see the `bugihfa == 1` branches below).
layout (constant_id = 0) const int bugihfa = 0;

// Shape specialization constants. When a value is 0 the push-constant field of
// the same name is used instead; the psc() macro selects between the two.
#define shape_constant_id_offset 1
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
#else
// Buffer path: source is scalar storage; with fp16 packed storage two
// consecutive scalars share one sfpvec2 element.
#if NCNN_fp16_packed
layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; };
#else
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
#endif
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
#endif

// Runtime shape parameters; each field is consulted only when its matching
// specialization constant is 0 (via psc()).
layout (push_constant) uniform parameter
{
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    int outdims;
    int outw;
    int outh;
    int outc;
    int outcstep;

    int woffset;
    int hoffset;
    int coffset;
} p;

void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // Guard against the padded dispatch grid.
    if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
        return;

    if (psc(dims) == 1)
    {
        // 1-D: gather the 8 consecutive source elements starting at
        // gx * 8 + woffset. x4 holds the first four indices, xx4 the next four.
        ivec4 x4 = gx * 8 + p.woffset + ivec4(0, 1, 2, 3);
        ivec4 xx4 = x4 + 4;

#if NCNN_image_shader
        // Each source texel carries 4 consecutive elements: index/4 selects
        // the texel, index%4 selects the component within it.
        afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x4.r / 4, 0, 0));
        afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x4.g / 4, 0, 0));
        afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(x4.b / 4, 0, 0));
        afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(x4.a / 4, 0, 0));
        afpvec4 v4 = image3d_ld4(bottom_blob, ivec3(xx4.r / 4, 0, 0));
        afpvec4 v5 = image3d_ld4(bottom_blob, ivec3(xx4.g / 4, 0, 0));
        afpvec4 v6 = image3d_ld4(bottom_blob, ivec3(xx4.b / 4, 0, 0));
        afpvec4 v7 = image3d_ld4(bottom_blob, ivec3(xx4.a / 4, 0, 0));

        afpvec8 v;
#if NCNN_fp16_arithmetic
        if (bugihfa == 1)
        {
            // Driver workaround: pick each component with explicit compares
            // instead of dynamic indexing (v0[i]) on f16 vectors.
            ivec4 x4m4 = x4 % 4;
            ivec4 xx4m4 = xx4 % 4;

            if (x4m4.r == 0) v[0].r = v0.r;
            if (x4m4.r == 1) v[0].r = v0.g;
            if (x4m4.r == 2) v[0].r = v0.b;
            if (x4m4.r == 3) v[0].r = v0.a;
            if (x4m4.g == 0) v[0].g = v1.r;
            if (x4m4.g == 1) v[0].g = v1.g;
            if (x4m4.g == 2) v[0].g = v1.b;
            if (x4m4.g == 3) v[0].g = v1.a;
            if (x4m4.b == 0) v[0].b = v2.r;
            if (x4m4.b == 1) v[0].b = v2.g;
            if (x4m4.b == 2) v[0].b = v2.b;
            if (x4m4.b == 3) v[0].b = v2.a;
            if (x4m4.a == 0) v[0].a = v3.r;
            if (x4m4.a == 1) v[0].a = v3.g;
            if (x4m4.a == 2) v[0].a = v3.b;
            if (x4m4.a == 3) v[0].a = v3.a;
            if (xx4m4.r == 0) v[1].r = v4.r;
            if (xx4m4.r == 1) v[1].r = v4.g;
            if (xx4m4.r == 2) v[1].r = v4.b;
            if (xx4m4.r == 3) v[1].r = v4.a;
            if (xx4m4.g == 0) v[1].g = v5.r;
            if (xx4m4.g == 1) v[1].g = v5.g;
            if (xx4m4.g == 2) v[1].g = v5.b;
            if (xx4m4.g == 3) v[1].g = v5.a;
            if (xx4m4.b == 0) v[1].b = v6.r;
            if (xx4m4.b == 1) v[1].b = v6.g;
            if (xx4m4.b == 2) v[1].b = v6.b;
            if (xx4m4.b == 3) v[1].b = v6.a;
            if (xx4m4.a == 0) v[1].a = v7.r;
            if (xx4m4.a == 1) v[1].a = v7.g;
            if (xx4m4.a == 2) v[1].a = v7.b;
            if (xx4m4.a == 3) v[1].a = v7.a;
        }
        else
#endif
        {
            v[0].r = v0[x4.r % 4];
            v[0].g = v1[x4.g % 4];
            v[0].b = v2[x4.b % 4];
            v[0].a = v3[x4.a % 4];
            v[1].r = v4[xx4.r % 4];
            v[1].g = v5[xx4.g % 4];
            v[1].b = v6[xx4.b % 4];
            v[1].a = v7[xx4.a % 4];
        }

        image3d_st8(top_blob, ivec3(gx, 0, 0), v);
#else
#if NCNN_fp16_packed
        // sfpvec2 index of element x is (x/4)*2 + (x%4)/2 (== x/2 for x >= 0,
        // written this way to mirror the dims 2/3 forms); lane = x % 2.
        ivec4 v_offset = (x4 / 4) * 2 + (x4 % 4) / 2;
        ivec4 lane2 = x4 % 2;
        ivec4 vv_offset = (xx4 / 4) * 2 + (xx4 % 4) / 2;
        ivec4 lane4 = xx4 % 2;

        afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r);
        afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
        afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
        afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);

        afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r);
        afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g);
        afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b);
        afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a);

        afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]);

        buffer_st8(top_blob_data, gx, v);
#else
        // Scalar storage: (x/4)*4 + x%4 is just the element index itself,
        // written to mirror the dims 2/3 forms.
        ivec4 v_offset = (x4 / 4) * 4 + x4 % 4;
        ivec4 vv_offset = (xx4 / 4) * 4 + xx4 % 4;

        buffer_cp1to8(top_blob_data, gx, bottom_blob_data, v_offset, vv_offset);
#endif
#endif
    }
    else if (psc(dims) == 2)
    {
        // 2-D: pack along h. Each output row gy holds source rows
        // gy*8+hoffset .. gy*8+hoffset+7 at column x.
        int x = gx + p.woffset;
        ivec4 y4 = gy * 8 + p.hoffset + ivec4(0, 1, 2, 3);
        ivec4 yy4 = y4 + 4;

#if NCNN_image_shader
        afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x, y4.r / 4, 0));
        afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x, y4.g / 4, 0));
        afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(x, y4.b / 4, 0));
        afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(x, y4.a / 4, 0));
        afpvec4 v4 = image3d_ld4(bottom_blob, ivec3(x, yy4.r / 4, 0));
        afpvec4 v5 = image3d_ld4(bottom_blob, ivec3(x, yy4.g / 4, 0));
        afpvec4 v6 = image3d_ld4(bottom_blob, ivec3(x, yy4.b / 4, 0));
        afpvec4 v7 = image3d_ld4(bottom_blob, ivec3(x, yy4.a / 4, 0));

        afpvec8 v;
#if NCNN_fp16_arithmetic
        if (bugihfa == 1)
        {
            // Same f16 dynamic-indexing workaround as the dims == 1 branch.
            ivec4 y4m4 = y4 % 4;
            ivec4 yy4m4 = yy4 % 4;

            if (y4m4.r == 0) v[0].r = v0.r;
            if (y4m4.r == 1) v[0].r = v0.g;
            if (y4m4.r == 2) v[0].r = v0.b;
            if (y4m4.r == 3) v[0].r = v0.a;
            if (y4m4.g == 0) v[0].g = v1.r;
            if (y4m4.g == 1) v[0].g = v1.g;
            if (y4m4.g == 2) v[0].g = v1.b;
            if (y4m4.g == 3) v[0].g = v1.a;
            if (y4m4.b == 0) v[0].b = v2.r;
            if (y4m4.b == 1) v[0].b = v2.g;
            if (y4m4.b == 2) v[0].b = v2.b;
            if (y4m4.b == 3) v[0].b = v2.a;
            if (y4m4.a == 0) v[0].a = v3.r;
            if (y4m4.a == 1) v[0].a = v3.g;
            if (y4m4.a == 2) v[0].a = v3.b;
            if (y4m4.a == 3) v[0].a = v3.a;
            if (yy4m4.r == 0) v[1].r = v4.r;
            if (yy4m4.r == 1) v[1].r = v4.g;
            if (yy4m4.r == 2) v[1].r = v4.b;
            if (yy4m4.r == 3) v[1].r = v4.a;
            if (yy4m4.g == 0) v[1].g = v5.r;
            if (yy4m4.g == 1) v[1].g = v5.g;
            if (yy4m4.g == 2) v[1].g = v5.b;
            if (yy4m4.g == 3) v[1].g = v5.a;
            if (yy4m4.b == 0) v[1].b = v6.r;
            if (yy4m4.b == 1) v[1].b = v6.g;
            if (yy4m4.b == 2) v[1].b = v6.b;
            if (yy4m4.b == 3) v[1].b = v6.a;
            if (yy4m4.a == 0) v[1].a = v7.r;
            if (yy4m4.a == 1) v[1].a = v7.g;
            if (yy4m4.a == 2) v[1].a = v7.b;
            if (yy4m4.a == 3) v[1].a = v7.a;
        }
        else
#endif
        {
            v[0].r = v0[y4.r % 4];
            v[0].g = v1[y4.g % 4];
            v[0].b = v2[y4.b % 4];
            v[0].a = v3[y4.a % 4];
            v[1].r = v4[yy4.r % 4];
            v[1].g = v5[yy4.g % 4];
            v[1].b = v6[yy4.b % 4];
            v[1].a = v7[yy4.a % 4];
        }

        image3d_st8(top_blob, ivec3(gx, gy, 0), v);
#else
        // Linear output index of the pack-8 element.
        int gi = gy * psc(outw) + gx;

#if NCNN_fp16_packed
        // Row-major scalar address ((y/4)*w + x scaled for the source's own
        // packing) converted to an sfpvec2 index plus a lane within the pair.
        ivec4 v_offset = ((y4 / 4) * psc(w) + x) * 2 + (y4 % 4) / 2;
        ivec4 lane2 = y4 % 2;
        ivec4 vv_offset = ((yy4 / 4) * psc(w) + x) * 2 + (yy4 % 4) / 2;
        ivec4 lane4 = yy4 % 2;

        afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r);
        afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
        afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
        afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);

        afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r);
        afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g);
        afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b);
        afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a);

        afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]);

        buffer_st8(top_blob_data, gi, v);
#else
        ivec4 v_offset = ((y4 / 4) * psc(w) + x) * 4 + y4 % 4;
        ivec4 vv_offset = ((yy4 / 4) * psc(w) + x) * 4 + yy4 % 4;

        buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset);
#endif
#endif
    }
    else if (psc(dims) == 3)
    {
        // 3-D: pack along c. Each output channel gz holds source channels
        // gz*8+coffset .. gz*8+coffset+7 at position (x, y).
        int x = gx + p.woffset;
        int y = gy + p.hoffset;
        ivec4 z4 = gz * 8 + p.coffset + ivec4(0, 1, 2, 3);
        ivec4 zz4 = z4 + 4;

#if NCNN_image_shader
        afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x, y, z4.r / 4));
        afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x, y, z4.g / 4));
        afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(x, y, z4.b / 4));
        afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(x, y, z4.a / 4));
        afpvec4 v4 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.r / 4));
        afpvec4 v5 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.g / 4));
        afpvec4 v6 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.b / 4));
        afpvec4 v7 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.a / 4));

        afpvec8 v;
#if NCNN_fp16_arithmetic
        if (bugihfa == 1)
        {
            // Same f16 dynamic-indexing workaround as the dims == 1 branch.
            ivec4 z4m4 = z4 % 4;
            ivec4 zz4m4 = zz4 % 4;

            if (z4m4.r == 0) v[0].r = v0.r;
            if (z4m4.r == 1) v[0].r = v0.g;
            if (z4m4.r == 2) v[0].r = v0.b;
            if (z4m4.r == 3) v[0].r = v0.a;
            if (z4m4.g == 0) v[0].g = v1.r;
            if (z4m4.g == 1) v[0].g = v1.g;
            if (z4m4.g == 2) v[0].g = v1.b;
            if (z4m4.g == 3) v[0].g = v1.a;
            if (z4m4.b == 0) v[0].b = v2.r;
            if (z4m4.b == 1) v[0].b = v2.g;
            if (z4m4.b == 2) v[0].b = v2.b;
            if (z4m4.b == 3) v[0].b = v2.a;
            if (z4m4.a == 0) v[0].a = v3.r;
            if (z4m4.a == 1) v[0].a = v3.g;
            if (z4m4.a == 2) v[0].a = v3.b;
            if (z4m4.a == 3) v[0].a = v3.a;
            if (zz4m4.r == 0) v[1].r = v4.r;
            if (zz4m4.r == 1) v[1].r = v4.g;
            if (zz4m4.r == 2) v[1].r = v4.b;
            if (zz4m4.r == 3) v[1].r = v4.a;
            if (zz4m4.g == 0) v[1].g = v5.r;
            if (zz4m4.g == 1) v[1].g = v5.g;
            if (zz4m4.g == 2) v[1].g = v5.b;
            if (zz4m4.g == 3) v[1].g = v5.a;
            if (zz4m4.b == 0) v[1].b = v6.r;
            if (zz4m4.b == 1) v[1].b = v6.g;
            if (zz4m4.b == 2) v[1].b = v6.b;
            if (zz4m4.b == 3) v[1].b = v6.a;
            if (zz4m4.a == 0) v[1].a = v7.r;
            if (zz4m4.a == 1) v[1].a = v7.g;
            if (zz4m4.a == 2) v[1].a = v7.b;
            if (zz4m4.a == 3) v[1].a = v7.a;
        }
        else
#endif
        {
            v[0].r = v0[z4.r % 4];
            v[0].g = v1[z4.g % 4];
            v[0].b = v2[z4.b % 4];
            v[0].a = v3[z4.a % 4];
            v[1].r = v4[zz4.r % 4];
            v[1].g = v5[zz4.g % 4];
            v[1].b = v6[zz4.b % 4];
            v[1].a = v7[zz4.a % 4];
        }

        image3d_st8(top_blob, ivec3(gx, gy, gz), v);
#else
        // Linear output index: channel-major with per-channel stride outcstep.
        int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

#if NCNN_fp16_packed
        // Channel-major scalar address converted to an sfpvec2 index plus a
        // lane within the pair, as in the dims == 2 branch.
        ivec4 v_offset = ((z4 / 4) * psc(cstep) + y * psc(w) + x) * 2 + (z4 % 4) / 2;
        ivec4 lane2 = z4 % 2;
        ivec4 vv_offset = ((zz4 / 4) * psc(cstep) + y * psc(w) + x) * 2 + (zz4 % 4) / 2;
        ivec4 lane4 = zz4 % 2;

        afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r);
        afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
        afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
        afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);

        afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r);
        afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g);
        afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b);
        afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a);

        afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]);

        buffer_st8(top_blob_data, gi, v);
#else
        ivec4 v_offset = ((z4 / 4) * psc(cstep) + y * psc(w) + x) * 4 + z4 % 4;
        ivec4 vv_offset = ((zz4 / 4) * psc(cstep) + y * psc(w) + x) * 4 + zz4 % 4;

        buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset);
#endif
#endif
    }
}