1// Tencent is pleased to support the open source community by making ncnn available.
2//
3// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
4//
5// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6// in compliance with the License. You may obtain a copy of the License at
7//
8// https://opensource.org/licenses/BSD-3-Clause
9//
10// Unless required by applicable law or agreed to in writing, software distributed
11// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12// CONDITIONS OF ANY KIND, either express or implied. See the License for the
13// specific language governing permissions and limitations under the License.
14
#version 450

#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
// 8-wide fp16 storage element: lanes 0-3 in abcd, lanes 4-7 in efgh.
struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

// Driver-workaround flag: when 1, dynamic component indexing of fp16
// vectors (v[i]) is avoided and replaced by the explicit if-chains in
// main() below.
layout (constant_id = 0) const int bugihfa = 0;

// Bottom (input) blob shape, baked in at pipeline creation when known
// (0 means "use the push constant instead" — psc() selects).
#define shape_constant_id_offset 1
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

// Top (output) blob shape, same convention as above.
layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;

// unfp / imfmtc4 / sfp / sfpvec2 are supplied by ncnn's generated
// shader preamble (not visible in this file).
#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
#else
#if NCNN_fp16_packed
// fp16_packed buffers store 2 elements per sfpvec2.
layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; };
#else
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
#endif
// Output is always pack8: one sfpvec8 per output cell.
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
#endif

// Push constants mirror the shape specialization constants (psc() picks
// whichever is valid) plus the region offsets into the bottom blob.
layout (push_constant) uniform parameter
{
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    int outdims;
    int outw;
    int outh;
    int outc;
    int outcstep;

    int woffset;
    int hoffset;
    int coffset;
} p;
70
// Gather 8 consecutive source elements into one pack8 cell of the top
// blob, starting at the configured woffset/hoffset/coffset (Crop-style
// region copy).  The packed axis is x for dims==1, y for dims==2 and
// the channel for dims==3.  The bottom blob is addressed pack4-style:
// element e lives in cell e/4, component e%4 (see the /4 and %4 index
// math below).  psc(), afp*, buffer_* and image3d_* helpers come from
// ncnn's generated shader preamble (not visible in this file).
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // Drop invocations outside the output extent (dispatch is rounded
    // up to workgroup granularity).
    if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
        return;

    if (psc(dims) == 1)
    {
        // Source x indices of this cell's 8 elements:
        // x4 = output lanes 0-3, xx4 = output lanes 4-7.
        ivec4 x4 = gx * 8 + p.woffset + ivec4(0, 1, 2, 3);
        ivec4 xx4 = x4 + 4;

#if NCNN_image_shader
        // Fetch the pack4 texel holding each element (texel e/4), then
        // select component e%4 below.
        afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x4.r / 4, 0, 0));
        afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x4.g / 4, 0, 0));
        afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(x4.b / 4, 0, 0));
        afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(x4.a / 4, 0, 0));
        afpvec4 v4 = image3d_ld4(bottom_blob, ivec3(xx4.r / 4, 0, 0));
        afpvec4 v5 = image3d_ld4(bottom_blob, ivec3(xx4.g / 4, 0, 0));
        afpvec4 v6 = image3d_ld4(bottom_blob, ivec3(xx4.b / 4, 0, 0));
        afpvec4 v7 = image3d_ld4(bottom_blob, ivec3(xx4.a / 4, 0, 0));

        afpvec8 v;
#if NCNN_fp16_arithmetic
        if (bugihfa == 1)
        {
            // Workaround for drivers where dynamic indexing of fp16
            // vectors is broken: pick each component via explicit
            // compile-time swizzles instead of v0[i].
            ivec4 x4m4 = x4 % 4;
            ivec4 xx4m4 = xx4 % 4;

            if (x4m4.r == 0) v[0].r = v0.r;
            if (x4m4.r == 1) v[0].r = v0.g;
            if (x4m4.r == 2) v[0].r = v0.b;
            if (x4m4.r == 3) v[0].r = v0.a;
            if (x4m4.g == 0) v[0].g = v1.r;
            if (x4m4.g == 1) v[0].g = v1.g;
            if (x4m4.g == 2) v[0].g = v1.b;
            if (x4m4.g == 3) v[0].g = v1.a;
            if (x4m4.b == 0) v[0].b = v2.r;
            if (x4m4.b == 1) v[0].b = v2.g;
            if (x4m4.b == 2) v[0].b = v2.b;
            if (x4m4.b == 3) v[0].b = v2.a;
            if (x4m4.a == 0) v[0].a = v3.r;
            if (x4m4.a == 1) v[0].a = v3.g;
            if (x4m4.a == 2) v[0].a = v3.b;
            if (x4m4.a == 3) v[0].a = v3.a;
            if (xx4m4.r == 0) v[1].r = v4.r;
            if (xx4m4.r == 1) v[1].r = v4.g;
            if (xx4m4.r == 2) v[1].r = v4.b;
            if (xx4m4.r == 3) v[1].r = v4.a;
            if (xx4m4.g == 0) v[1].g = v5.r;
            if (xx4m4.g == 1) v[1].g = v5.g;
            if (xx4m4.g == 2) v[1].g = v5.b;
            if (xx4m4.g == 3) v[1].g = v5.a;
            if (xx4m4.b == 0) v[1].b = v6.r;
            if (xx4m4.b == 1) v[1].b = v6.g;
            if (xx4m4.b == 2) v[1].b = v6.b;
            if (xx4m4.b == 3) v[1].b = v6.a;
            if (xx4m4.a == 0) v[1].a = v7.r;
            if (xx4m4.a == 1) v[1].a = v7.g;
            if (xx4m4.a == 2) v[1].a = v7.b;
            if (xx4m4.a == 3) v[1].a = v7.a;
        }
        else
#endif
        {
            // Normal path: dynamic component selection (e % 4).
            v[0].r = v0[x4.r % 4];
            v[0].g = v1[x4.g % 4];
            v[0].b = v2[x4.b % 4];
            v[0].a = v3[x4.a % 4];
            v[1].r = v4[xx4.r % 4];
            v[1].g = v5[xx4.g % 4];
            v[1].b = v6[xx4.b % 4];
            v[1].a = v7[xx4.a % 4];
        }

        image3d_st8(top_blob, ivec3(gx, 0, 0), v);
#else
#if NCNN_fp16_packed
        // fp16_packed stores 2 elements per sfpvec2: element e maps to
        // pair (e/4)*2 + (e%4)/2 (== e/2), lane e % 2 within the pair.
        ivec4 v_offset = (x4 / 4) * 2 + (x4 % 4) / 2;
        ivec4 lane2 = x4 % 2;
        ivec4 vv_offset = (xx4 / 4) * 2 + (xx4 % 4) / 2;
        ivec4 lane4 = xx4 % 2;

        afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r);
        afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
        afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
        afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);

        afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r);
        afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g);
        afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b);
        afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a);

        // Assemble the pack8 value from the 8 selected lanes.
        afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]);

        buffer_st8(top_blob_data, gx, v);
#else
        // Scalar storage: element e of cell e/4 sits at (e/4)*4 + e%4;
        // copy the 8 scalars straight into one pack8 cell.
        ivec4 v_offset = (x4 / 4) * 4 + x4 % 4;
        ivec4 vv_offset = (xx4 / 4) * 4 + xx4 % 4;

        buffer_cp1to8(top_blob_data, gx, bottom_blob_data, v_offset, vv_offset);
#endif
#endif
    }
    else if (psc(dims) == 2)
    {
        // 2-D: the 8 gathered elements run along y; x is a plain offset.
        int x = gx + p.woffset;
        ivec4 y4 = gy * 8 + p.hoffset + ivec4(0, 1, 2, 3);
        ivec4 yy4 = y4 + 4;

#if NCNN_image_shader
        // Pack4 texel row y/4, component y%4 selected below.
        afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x, y4.r / 4, 0));
        afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x, y4.g / 4, 0));
        afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(x, y4.b / 4, 0));
        afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(x, y4.a / 4, 0));
        afpvec4 v4 = image3d_ld4(bottom_blob, ivec3(x, yy4.r / 4, 0));
        afpvec4 v5 = image3d_ld4(bottom_blob, ivec3(x, yy4.g / 4, 0));
        afpvec4 v6 = image3d_ld4(bottom_blob, ivec3(x, yy4.b / 4, 0));
        afpvec4 v7 = image3d_ld4(bottom_blob, ivec3(x, yy4.a / 4, 0));

        afpvec8 v;
#if NCNN_fp16_arithmetic
        if (bugihfa == 1)
        {
            // Same fp16 dynamic-indexing workaround as the dims==1 case.
            ivec4 y4m4 = y4 % 4;
            ivec4 yy4m4 = yy4 % 4;

            if (y4m4.r == 0) v[0].r = v0.r;
            if (y4m4.r == 1) v[0].r = v0.g;
            if (y4m4.r == 2) v[0].r = v0.b;
            if (y4m4.r == 3) v[0].r = v0.a;
            if (y4m4.g == 0) v[0].g = v1.r;
            if (y4m4.g == 1) v[0].g = v1.g;
            if (y4m4.g == 2) v[0].g = v1.b;
            if (y4m4.g == 3) v[0].g = v1.a;
            if (y4m4.b == 0) v[0].b = v2.r;
            if (y4m4.b == 1) v[0].b = v2.g;
            if (y4m4.b == 2) v[0].b = v2.b;
            if (y4m4.b == 3) v[0].b = v2.a;
            if (y4m4.a == 0) v[0].a = v3.r;
            if (y4m4.a == 1) v[0].a = v3.g;
            if (y4m4.a == 2) v[0].a = v3.b;
            if (y4m4.a == 3) v[0].a = v3.a;
            if (yy4m4.r == 0) v[1].r = v4.r;
            if (yy4m4.r == 1) v[1].r = v4.g;
            if (yy4m4.r == 2) v[1].r = v4.b;
            if (yy4m4.r == 3) v[1].r = v4.a;
            if (yy4m4.g == 0) v[1].g = v5.r;
            if (yy4m4.g == 1) v[1].g = v5.g;
            if (yy4m4.g == 2) v[1].g = v5.b;
            if (yy4m4.g == 3) v[1].g = v5.a;
            if (yy4m4.b == 0) v[1].b = v6.r;
            if (yy4m4.b == 1) v[1].b = v6.g;
            if (yy4m4.b == 2) v[1].b = v6.b;
            if (yy4m4.b == 3) v[1].b = v6.a;
            if (yy4m4.a == 0) v[1].a = v7.r;
            if (yy4m4.a == 1) v[1].a = v7.g;
            if (yy4m4.a == 2) v[1].a = v7.b;
            if (yy4m4.a == 3) v[1].a = v7.a;
        }
        else
#endif
        {
            v[0].r = v0[y4.r % 4];
            v[0].g = v1[y4.g % 4];
            v[0].b = v2[y4.b % 4];
            v[0].a = v3[y4.a % 4];
            v[1].r = v4[yy4.r % 4];
            v[1].g = v5[yy4.g % 4];
            v[1].b = v6[yy4.b % 4];
            v[1].a = v7[yy4.a % 4];
        }

        image3d_st8(top_blob, ivec3(gx, gy, 0), v);
#else
        // Linear index of this pack8 output cell.
        int gi = gy * psc(outw) + gx;

#if NCNN_fp16_packed
        // Pack4 cell (y/4)*w + x, then pair (y%4)/2 and lane y%2 within it.
        ivec4 v_offset = ((y4 / 4) * psc(w) + x) * 2 + (y4 % 4) / 2;
        ivec4 lane2 = y4 % 2;
        ivec4 vv_offset = ((yy4 / 4) * psc(w) + x) * 2 + (yy4 % 4) / 2;
        ivec4 lane4 = yy4 % 2;

        afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r);
        afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
        afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
        afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);

        afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r);
        afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g);
        afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b);
        afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a);

        afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]);

        buffer_st8(top_blob_data, gi, v);
#else
        // Scalar storage: pack4 cell (y/4)*w + x, component y%4.
        ivec4 v_offset = ((y4 / 4) * psc(w) + x) * 4 + y4 % 4;
        ivec4 vv_offset = ((yy4 / 4) * psc(w) + x) * 4 + yy4 % 4;

        buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset);
#endif
#endif
    }
    else if (psc(dims) == 3)
    {
        // 3-D: the 8 gathered elements run along the channel axis;
        // x and y are plain offsets.
        int x = gx + p.woffset;
        int y = gy + p.hoffset;
        ivec4 z4 = gz * 8 + p.coffset + ivec4(0, 1, 2, 3);
        ivec4 zz4 = z4 + 4;

#if NCNN_image_shader
        // Pack4 slice z/4, component z%4 selected below.
        afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x, y, z4.r / 4));
        afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x, y, z4.g / 4));
        afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(x, y, z4.b / 4));
        afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(x, y, z4.a / 4));
        afpvec4 v4 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.r / 4));
        afpvec4 v5 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.g / 4));
        afpvec4 v6 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.b / 4));
        afpvec4 v7 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.a / 4));

        afpvec8 v;
#if NCNN_fp16_arithmetic
        if (bugihfa == 1)
        {
            // Same fp16 dynamic-indexing workaround as the dims==1 case.
            ivec4 z4m4 = z4 % 4;
            ivec4 zz4m4 = zz4 % 4;

            if (z4m4.r == 0) v[0].r = v0.r;
            if (z4m4.r == 1) v[0].r = v0.g;
            if (z4m4.r == 2) v[0].r = v0.b;
            if (z4m4.r == 3) v[0].r = v0.a;
            if (z4m4.g == 0) v[0].g = v1.r;
            if (z4m4.g == 1) v[0].g = v1.g;
            if (z4m4.g == 2) v[0].g = v1.b;
            if (z4m4.g == 3) v[0].g = v1.a;
            if (z4m4.b == 0) v[0].b = v2.r;
            if (z4m4.b == 1) v[0].b = v2.g;
            if (z4m4.b == 2) v[0].b = v2.b;
            if (z4m4.b == 3) v[0].b = v2.a;
            if (z4m4.a == 0) v[0].a = v3.r;
            if (z4m4.a == 1) v[0].a = v3.g;
            if (z4m4.a == 2) v[0].a = v3.b;
            if (z4m4.a == 3) v[0].a = v3.a;
            if (zz4m4.r == 0) v[1].r = v4.r;
            if (zz4m4.r == 1) v[1].r = v4.g;
            if (zz4m4.r == 2) v[1].r = v4.b;
            if (zz4m4.r == 3) v[1].r = v4.a;
            if (zz4m4.g == 0) v[1].g = v5.r;
            if (zz4m4.g == 1) v[1].g = v5.g;
            if (zz4m4.g == 2) v[1].g = v5.b;
            if (zz4m4.g == 3) v[1].g = v5.a;
            if (zz4m4.b == 0) v[1].b = v6.r;
            if (zz4m4.b == 1) v[1].b = v6.g;
            if (zz4m4.b == 2) v[1].b = v6.b;
            if (zz4m4.b == 3) v[1].b = v6.a;
            if (zz4m4.a == 0) v[1].a = v7.r;
            if (zz4m4.a == 1) v[1].a = v7.g;
            if (zz4m4.a == 2) v[1].a = v7.b;
            if (zz4m4.a == 3) v[1].a = v7.a;
        }
        else
#endif
        {
            v[0].r = v0[z4.r % 4];
            v[0].g = v1[z4.g % 4];
            v[0].b = v2[z4.b % 4];
            v[0].a = v3[z4.a % 4];
            v[1].r = v4[zz4.r % 4];
            v[1].g = v5[zz4.g % 4];
            v[1].b = v6[zz4.b % 4];
            v[1].a = v7[zz4.a % 4];
        }

        image3d_st8(top_blob, ivec3(gx, gy, gz), v);
#else
        // Linear index of this pack8 output cell.
        int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

#if NCNN_fp16_packed
        // Pack4 cell (z/4)*cstep + y*w + x, then pair (z%4)/2, lane z%2.
        ivec4 v_offset = ((z4 / 4) * psc(cstep) + y * psc(w) + x) * 2 + (z4 % 4) / 2;
        ivec4 lane2 = z4 % 2;
        ivec4 vv_offset = ((zz4 / 4) * psc(cstep) + y * psc(w) + x) * 2 + (zz4 % 4) / 2;
        ivec4 lane4 = zz4 % 2;

        afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r);
        afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
        afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
        afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);

        afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r);
        afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g);
        afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b);
        afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a);

        afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]);

        buffer_st8(top_blob_data, gi, v);
#else
        // Scalar storage: pack4 cell (z/4)*cstep + y*w + x, component z%4.
        ivec4 v_offset = ((z4 / 4) * psc(cstep) + y * psc(w) + x) * 4 + z4 % 4;
        ivec4 vv_offset = ((zz4 / 4) * psc(cstep) + y * psc(w) + x) * 4 + zz4 % 4;

        buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset);
#endif
#endif
    }
}
379