1// Tencent is pleased to support the open source community by making ncnn available.
2//
3// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
4//
5// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6// in compliance with the License. You may obtain a copy of the License at
7//
8// https://opensource.org/licenses/BSD-3-Clause
9//
10// Unless required by applicable law or agreed to in writing, software distributed
11// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12// CONDITIONS OF ANY KIND, either express or implied. See the License for the
13// specific language governing permissions and limitations under the License.
14
15#version 450
16
// Interface declarations for a compute shader that reads 8-packed elements
// from the bottom blob and writes them as individual scalar elements to the
// top blob, with the output axis arrangement chosen by order_type.

// With fp16 storage enabled, sfpvec8 is declared here as two f16vec4 halves
// (abcd, efgh).
// NOTE(review): no sfpvec8 declaration is visible for the non-fp16-storage
// case; presumably it is supplied by a preamble outside this file - confirm.
17#if NCNN_fp16_storage
18#extension GL_EXT_shader_16bit_storage: require
19struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
20#endif
21#if NCNN_fp16_arithmetic
22#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
23#endif
24
// order_type selects the output axis arrangement. The values handled by
// main() are 0..1 when dims == 2 and 0..5 otherwise (see the tables there).
// bugihfa is not referenced anywhere in the visible shader.
// NOTE(review): presumably a placeholder occupying constant_id 1 - confirm
// against the host-side specialization setup before touching it.
25layout (constant_id = 0) const int order_type = 0;
26layout (constant_id = 1) const int bugihfa = 0;
27
// Shape specialization constants start at this id. The first five describe
// the bottom (input) blob, the next five the top (output) blob.
// main() reads them through psc().
// NOTE(review): psc() is defined outside this file; presumably it falls back
// to the push constant of the same name when the constant is left at 0 -
// confirm.
28#define shape_constant_id_offset 2
29layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
30layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
31layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
32layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
33layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;
34
35layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
36layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
37layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
38layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
39layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
40
// Binding 0 is the input, binding 1 the output.
// Image path: input is a 3D sampler, output a write-only 3D image.
// Buffer path: input is a read-only array of sfpvec8, output a write-only
// array of sfp.
41#if NCNN_image_shader
42layout (binding = 0) uniform unfp sampler3D bottom_blob_3d;
43layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d;
44#else
45layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
46layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
47#endif
48
// Push constants carrying the same shape fields, under the same names and in
// the same order, as the shape specialization constants above.
49layout (push_constant) uniform parameter
50{
51    int dims;
52    int w;
53    int h;
54    int c;
55    int cstep;
56
57    int outdims;
58    int outw;
59    int outh;
60    int outc;
61    int outcstep;
62} p;
63
// Entry point: one invocation takes the 8-packed input element at
// (gx, gy, gz) = gl_GlobalInvocationID and scatters its eight lanes to eight
// scalar output positions. order_type picks which output axis each input
// axis lands on; lane k always goes to index (packed_index * 8 + k) along
// the axis being unpacked (h when dims == 2, c otherwise).
void main()
{
    ivec3 gid = ivec3(gl_GlobalInvocationID);

    // Drop invocations that fall outside the packed input extent.
    if (any(greaterThanEqual(gid, ivec3(psc(w), psc(h), psc(c)))))
    {
        return;
    }

    int ix = gid.x;
    int iy = gid.y;
    int iz = gid.z;

    bool is2d = (psc(dims) == 2);

    // Output index of lane 0 along the unpacked axis.
    int k0 = (is2d ? iy : iz) * 8;

#if NCNN_image_shader
    // The 2D case always reads depth slice 0.
    afpvec8 src = image3d_ld8(bottom_blob_3d, ivec3(ix, iy, is2d ? 0 : iz));

    // Lane k is stored at base + stride * k.
    ivec3 base;
    ivec3 stride;

    if (is2d)
    {
        // order_type
        // 0 = w h
        // 1 = h w
        if (order_type == 0)
        {
            base = ivec3(ix, k0, 0);
            stride = ivec3(0, 1, 0);
        }
        else if (order_type == 1)
        {
            base = ivec3(k0, ix, 0);
            stride = ivec3(1, 0, 0);
        }
        else
        {
            // unknown order: nothing is written
            return;
        }
    }
    else
    {
        // order_type
        // 0 = w h c
        // 1 = h w c
        // 2 = w c h
        // 3 = c w h
        // 4 = h c w
        // 5 = c h w
        if (order_type == 0)
        {
            base = ivec3(ix, iy, k0);
            stride = ivec3(0, 0, 1);
        }
        else if (order_type == 1)
        {
            base = ivec3(iy, ix, k0);
            stride = ivec3(0, 0, 1);
        }
        else if (order_type == 2)
        {
            base = ivec3(ix, k0, iy);
            stride = ivec3(0, 1, 0);
        }
        else if (order_type == 3)
        {
            base = ivec3(k0, ix, iy);
            stride = ivec3(1, 0, 0);
        }
        else if (order_type == 4)
        {
            base = ivec3(iy, k0, ix);
            stride = ivec3(0, 1, 0);
        }
        else if (order_type == 5)
        {
            base = ivec3(k0, iy, ix);
            stride = ivec3(1, 0, 0);
        }
        else
        {
            // unknown order: nothing is written
            return;
        }
    }

    // Lanes 0..3 come from the first half, lanes 4..7 from the second.
    image3d_st1(top_blob_3d, (base), src[0].r);
    image3d_st1(top_blob_3d, (base + stride), src[0].g);
    image3d_st1(top_blob_3d, (base + stride * 2), src[0].b);
    image3d_st1(top_blob_3d, (base + stride * 3), src[0].a);
    image3d_st1(top_blob_3d, (base + stride * 4), src[1].r);
    image3d_st1(top_blob_3d, (base + stride * 5), src[1].g);
    image3d_st1(top_blob_3d, (base + stride * 6), src[1].b);
    image3d_st1(top_blob_3d, (base + stride * 7), src[1].a);
#else
    // Flat output offset of lane 0, and the distance between consecutive
    // lanes. Both stay unassigned for an order_type outside the tables
    // below, exactly as the offsets would without this formulation.
    int first;
    int lane_step;

    if (is2d)
    {
        // order_type
        // 0 = w h
        // 1 = h w
        if (order_type == 0)
        {
            first = k0 * psc(outw) + ix;
            lane_step = psc(outw);
        }
        else if (order_type == 1)
        {
            first = ix * psc(outw) + k0;
            lane_step = 1;
        }
    }
    else
    {
        // order_type
        // 0 = w h c
        // 1 = h w c
        // 2 = w c h
        // 3 = c w h
        // 4 = h c w
        // 5 = c h w
        if (order_type == 0)
        {
            first = k0 * psc(outcstep) + iy * psc(outw) + ix;
            lane_step = psc(outcstep);
        }
        else if (order_type == 1)
        {
            first = k0 * psc(outcstep) + ix * psc(outw) + iy;
            lane_step = psc(outcstep);
        }
        else if (order_type == 2)
        {
            first = iy * psc(outcstep) + k0 * psc(outw) + ix;
            lane_step = psc(outw);
        }
        else if (order_type == 3)
        {
            first = iy * psc(outcstep) + ix * psc(outw) + k0;
            lane_step = 1;
        }
        else if (order_type == 4)
        {
            first = ix * psc(outcstep) + k0 * psc(outw) + iy;
            lane_step = psc(outw);
        }
        else if (order_type == 5)
        {
            first = ix * psc(outcstep) + iy * psc(outw) + k0;
            lane_step = 1;
        }
    }

    // Offsets for lanes 0..3 and lanes 4..7 respectively.
    ivec4 dst_lo = ivec4(first) + ivec4(0, 1, 2, 3) * lane_step;
    ivec4 dst_hi = dst_lo + 4 * lane_step;

    // Flat index of the packed source element.
    int src_index = iz * psc(cstep) + iy * psc(w) + ix;

    // Plain identifiers are passed on purpose; the copy helper is defined
    // outside this file.
    buffer_cp8to1(top_blob_data, dst_lo, dst_hi, bottom_blob_data, src_index);
#endif
}
276