1// Tencent is pleased to support the open source community by making ncnn available.
2//
3// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
4//
5// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6// in compliance with the License. You may obtain a copy of the License at
7//
8// https://opensource.org/licenses/BSD-3-Clause
9//
10// Unless required by applicable law or agreed to in writing, software distributed
11// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12// CONDITIONS OF ANY KIND, either express or implied. See the License for the
13// specific language governing permissions and limitations under the License.
14
15#version 450
16
// Optional half-precision support, selected by build-time macros.
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
// Eight half-precision values kept as two f16vec4 halves.
struct sfpvec8
{
    f16vec4 abcd;
    f16vec4 efgh;
};
#endif

#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif
24
// Rank selector for the destination index math in main(); handled values there are 1..4.
layout (constant_id = 0) const int ndim = 0;

// Shape specialization constants start right after ndim.
// NOTE(review): a value of 0 presumably means "not specialized, read the push constant
// instead" via the psc() macro, which is defined outside this file - confirm.
#define shape_constant_id_offset 1

// Source blob shape.
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int d = 0;
layout (constant_id = shape_constant_id_offset + 4) const int c = 0;
layout (constant_id = shape_constant_id_offset + 5) const int cstep = 0;

// Destination blob shape.
layout (constant_id = shape_constant_id_offset + 6) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outd = 0;
layout (constant_id = shape_constant_id_offset + 10) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 11) const int outcstep = 0;
41
// Resource bindings: binding 0 is the read-only source, binding 1 the write-only destination.
#if NCNN_image_shader
// Image path: sampled 3D source, storage-image 3D destination.
layout (binding = 0) uniform unfp sampler3D bottom_blob_3d;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d;
#elif NCNN_fp16_packed
// Buffer path, packed fp16: the source is addressed in 2-component elements.
layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
#else
// Buffer path, scalar source elements.
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
#endif
53
// Runtime shape parameters. Member names mirror the shape specialization constants
// declared above; main() reads them only through psc(name).
layout (push_constant) uniform parameter
{
    // source blob
    int dims;
    int w;
    int h;
    int d;
    int c;
    int cstep;

    // destination blob
    int outdims;
    int outw;
    int outh;
    int outd;
    int outc;
    int outcstep;
} p;
70
// One invocation produces one 8-wide destination element at (gx, gy, gz).
//
// Steps:
//   1. Compute the flat, unpacked indices of the eight scalars that make up the
//      destination element (low four lanes and high four lanes).
//   2. Turn each flat index into a source coordinate. The source is addressed in
//      groups of four along its outermost axis: index / 4 picks the group and
//      index % 4 the lane inside it.
//   3. Fetch the eight scalars and store them as one element.
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // The y axis of the dispatch spans outh * outd rows.
    bool out_of_range = gx >= psc(outw) || gy >= psc(outh) * psc(outd) || gz >= psc(outc);
    if (out_of_range)
    {
        return;
    }

    // Step 1: flat indices of lanes 0..3 (lo_idx) and lanes 4..7 (hi_idx).
    // Consecutive lanes are lane_step scalars apart in the flat layout.
    ivec4 lo_idx;
    ivec4 hi_idx;
    if (ndim == 1)
    {
        lo_idx = gx * 8 + ivec4(0, 1, 2, 3);
        hi_idx = lo_idx + 4;
    }
    else if (ndim == 2)
    {
        int lane_step = psc(outw);

        lo_idx = (gy * 8) * lane_step + gx + ivec4(0, 1, 2, 3) * lane_step;
        hi_idx = lo_idx + 4 * lane_step;
    }
    else if (ndim == 3)
    {
        int lane_step = psc(outh) * psc(outw);

        lo_idx = (gz * 8) * lane_step + gy * psc(outw) + gx + ivec4(0, 1, 2, 3) * lane_step;
        hi_idx = lo_idx + 4 * lane_step;
    }
    else if (ndim == 4)
    {
        int lane_step = psc(outd) * psc(outh) * psc(outw);

        lo_idx = (gz * 8) * lane_step + gy * psc(outw) + gx + ivec4(0, 1, 2, 3) * lane_step;
        hi_idx = lo_idx + 4 * lane_step;
    }

    // Step 2: source coordinates for the low and high lane groups.
    ivec4 sx_lo;
    ivec4 sx_hi;
    ivec4 sy_lo;
    ivec4 sy_hi;
    ivec4 sz_lo;
    ivec4 sz_hi;

#if NCNN_image_shader
    // Component (0..3) to pick out of each fetched texel.
    ivec4 lane_lo;
    ivec4 lane_hi;
#else
    // Element offsets into bottom_blob_data.
    ivec4 off_lo;
    ivec4 off_hi;
#if NCNN_fp16_packed
    // Component (0..1) to pick out of each fetched 2-component element.
    ivec4 half_lo;
    ivec4 half_hi;
#endif
#endif

    if (psc(dims) == 1)
    {
        sz_lo = ivec4(0);
        sz_hi = ivec4(0);
        sy_lo = ivec4(0);
        sy_hi = ivec4(0);
        sx_lo = lo_idx;
        sx_hi = hi_idx;

#if NCNN_image_shader
        // The 4-wide grouping runs along x for a 1-D source.
        lane_lo = sx_lo % 4;
        lane_hi = sx_hi % 4;
        sx_lo = sx_lo / 4;
        sx_hi = sx_hi / 4;
#else
#if NCNN_fp16_packed
        off_lo = lo_idx / 2;
        half_lo = lo_idx % 2;
        off_hi = hi_idx / 2;
        half_hi = hi_idx % 2;
#else
        off_lo = lo_idx;
        off_hi = hi_idx;
#endif
#endif
    }
    else if (psc(dims) == 2)
    {
        int in_w = psc(w);

        sz_lo = ivec4(0);
        sz_hi = ivec4(0);
        sy_lo = lo_idx / in_w;
        sy_hi = hi_idx / in_w;
        sx_lo = lo_idx % in_w;
        sx_hi = hi_idx % in_w;

#if NCNN_image_shader
        // The 4-wide grouping runs along y for a 2-D source.
        lane_lo = sy_lo % 4;
        lane_hi = sy_hi % 4;
        sy_lo = sy_lo / 4;
        sy_hi = sy_hi / 4;
#else
#if NCNN_fp16_packed
        off_lo = ((sy_lo / 4) * in_w + sx_lo) * 2 + (sy_lo % 4) / 2;
        half_lo = sy_lo % 2;
        off_hi = ((sy_hi / 4) * in_w + sx_hi) * 2 + (sy_hi % 4) / 2;
        half_hi = sy_hi % 2;
#else
        off_lo = ((sy_lo / 4) * in_w + sx_lo) * 4 + sy_lo % 4;
        off_hi = ((sy_hi / 4) * in_w + sx_hi) * 4 + sy_hi % 4;
#endif
#endif
    }
    else if (psc(dims) == 3)
    {
        int in_w = psc(w);
        int plane = in_w * psc(h);

        // Position inside the w * h plane selected by z.
        ivec4 rem_lo = lo_idx % plane;
        ivec4 rem_hi = hi_idx % plane;

        sz_lo = lo_idx / plane;
        sz_hi = hi_idx / plane;
        sy_lo = rem_lo / in_w;
        sy_hi = rem_hi / in_w;
        sx_lo = rem_lo % in_w;
        sx_hi = rem_hi % in_w;

#if NCNN_image_shader
        // The 4-wide grouping runs along z for a 3-D source.
        lane_lo = sz_lo % 4;
        lane_hi = sz_hi % 4;
        sz_lo = sz_lo / 4;
        sz_hi = sz_hi / 4;
#else
#if NCNN_fp16_packed
        off_lo = ((sz_lo / 4) * psc(cstep) + sy_lo * in_w + sx_lo) * 2 + (sz_lo % 4) / 2;
        half_lo = sz_lo % 2;
        off_hi = ((sz_hi / 4) * psc(cstep) + sy_hi * in_w + sx_hi) * 2 + (sz_hi % 4) / 2;
        half_hi = sz_hi % 2;
#else
        off_lo = ((sz_lo / 4) * psc(cstep) + sy_lo * in_w + sx_lo) * 4 + sz_lo % 4;
        off_hi = ((sz_hi / 4) * psc(cstep) + sy_hi * in_w + sx_hi) * 4 + sz_hi % 4;
#endif
#endif
    }
    else // psc(dims) == 4
    {
        int in_w = psc(w);
        int slice = in_w * psc(h);
        int volume = slice * psc(d);

        // Position inside the w * h * d volume selected by z, then inside one w * h slice.
        ivec4 rem_lo = lo_idx % volume;
        ivec4 rem_hi = hi_idx % volume;
        ivec4 in_slice_lo = rem_lo % slice;
        ivec4 in_slice_hi = rem_hi % slice;

        sz_lo = lo_idx / volume;
        sz_hi = hi_idx / volume;

        // Depth and height are folded into one row coordinate: row = depth * h + height.
        sy_lo = (rem_lo / slice) * psc(h) + in_slice_lo / in_w;
        sy_hi = (rem_hi / slice) * psc(h) + in_slice_hi / in_w;

        sx_lo = in_slice_lo % in_w;
        sx_hi = in_slice_hi % in_w;

#if NCNN_image_shader
        lane_lo = sz_lo % 4;
        lane_hi = sz_hi % 4;
        sz_lo = sz_lo / 4;
        sz_hi = sz_hi / 4;
#else
#if NCNN_fp16_packed
        off_lo = ((sz_lo / 4) * psc(cstep) + sy_lo * in_w + sx_lo) * 2 + (sz_lo % 4) / 2;
        half_lo = sz_lo % 2;
        off_hi = ((sz_hi / 4) * psc(cstep) + sy_hi * in_w + sx_hi) * 2 + (sz_hi % 4) / 2;
        half_hi = sz_hi % 2;
#else
        off_lo = ((sz_lo / 4) * psc(cstep) + sy_lo * in_w + sx_lo) * 4 + sz_lo % 4;
        off_hi = ((sz_hi / 4) * psc(cstep) + sy_hi * in_w + sx_hi) * 4 + sz_hi % 4;
#endif
#endif
    }

    // Step 3: fetch the eight scalars and store them as one element.
#if NCNN_image_shader
    afpvec4 t0 = image3d_ld4(bottom_blob_3d, ivec3(sx_lo.r, sy_lo.r, sz_lo.r));
    afpvec4 t1 = image3d_ld4(bottom_blob_3d, ivec3(sx_lo.g, sy_lo.g, sz_lo.g));
    afpvec4 t2 = image3d_ld4(bottom_blob_3d, ivec3(sx_lo.b, sy_lo.b, sz_lo.b));
    afpvec4 t3 = image3d_ld4(bottom_blob_3d, ivec3(sx_lo.a, sy_lo.a, sz_lo.a));
    afpvec4 t4 = image3d_ld4(bottom_blob_3d, ivec3(sx_hi.r, sy_hi.r, sz_hi.r));
    afpvec4 t5 = image3d_ld4(bottom_blob_3d, ivec3(sx_hi.g, sy_hi.g, sz_hi.g));
    afpvec4 t6 = image3d_ld4(bottom_blob_3d, ivec3(sx_hi.b, sy_hi.b, sz_hi.b));
    afpvec4 t7 = image3d_ld4(bottom_blob_3d, ivec3(sx_hi.a, sy_hi.a, sz_hi.a));

    // Keep only the selected component of each texel.
    afpvec8 v;
    v[0].r = t0[lane_lo.r];
    v[0].g = t1[lane_lo.g];
    v[0].b = t2[lane_lo.b];
    v[0].a = t3[lane_lo.a];
    v[1].r = t4[lane_hi.r];
    v[1].g = t5[lane_hi.g];
    v[1].b = t6[lane_hi.b];
    v[1].a = t7[lane_hi.a];

    if (ndim == 1)
    {
        image3d_st8(top_blob_3d, ivec3(gx, 0, 0), v);
    }
    else if (ndim == 2)
    {
        image3d_st8(top_blob_3d, ivec3(gx, gy, 0), v);
    }
    else if (ndim == 3 || ndim == 4)
    {
        image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v);
    }
#else
    // Destination element index.
    int gi;
    if (ndim == 1)
    {
        gi = gx;
    }
    else if (ndim == 2)
    {
        gi = gy * psc(outw) + gx;
    }
    else if (ndim == 3 || ndim == 4)
    {
        gi = gz * psc(outcstep) + gy * psc(outw) + gx;
    }

#if NCNN_fp16_packed
    afpvec2 p0 = buffer_ld2(bottom_blob_data, off_lo.r);
    afpvec2 p1 = buffer_ld2(bottom_blob_data, off_lo.g);
    afpvec2 p2 = buffer_ld2(bottom_blob_data, off_lo.b);
    afpvec2 p3 = buffer_ld2(bottom_blob_data, off_lo.a);

    afpvec2 p4 = buffer_ld2(bottom_blob_data, off_hi.r);
    afpvec2 p5 = buffer_ld2(bottom_blob_data, off_hi.g);
    afpvec2 p6 = buffer_ld2(bottom_blob_data, off_hi.b);
    afpvec2 p7 = buffer_ld2(bottom_blob_data, off_hi.a);

    // Keep only the selected half of each 2-component element.
    afpvec8 v = afpvec8(p0[half_lo.r], p1[half_lo.g], p2[half_lo.b], p3[half_lo.a], p4[half_hi.r], p5[half_hi.g], p6[half_hi.b], p7[half_hi.a]);

    buffer_st8(top_blob_data, gi, v);
#else
    buffer_cp1to8(top_blob_data, gi, bottom_blob_data, off_lo, off_hi);
#endif
#endif
}
285