// Constants for a buffer->buffer copy.
// NOTE(review): no code in the visible part of this file reads SrcDst; the
// meaning of its components must be confirmed against the host-side code.
struct BufferCopy {
    uint4 SrcDst;
};
4
// Constants for an image->image copy.
// The helpers in this file add the dispatch thread id to .xy and pass .z
// through unchanged as the third Texture2DArray coordinate; .w is not read here.
struct ImageCopy {
    uint4 Src;
    uint4 Dst;
};
9
// Constants for a buffer<->image copy.
struct BufferImageCopy {
    // x = base byte offset into the buffer,
    // y = buffer row length (the address helpers scale it by the texel size and
    //     use max(y, ImageExtent.x) as the effective row length),
    // z = not read by the visible code (presumably the buffer image height — TODO confirm).
    uint4 BufferVars;
    // xy = texel offset of the copy region, z = third Texture2DArray coordinate.
    uint4 ImageOffset;
    // Size of the copy region; offset + extent is clamped against ImageSize.
    uint4 ImageExtent;
    // Upper limit used when clamping the copy region.
    uint4 ImageSize;
};
17
// Constant buffer bound at slot b0 holding the parameters of all copy kinds.
// Each kernel reads only the member that matches its own copy type.
cbuffer CopyConstants : register(b0) {
    BufferCopy BufferCopies;
    ImageCopy ImageCopies;
    BufferImageCopy BufferImageCopies;
};
23
24
// Returns the exclusive upper corner of the buffer<->image copy region:
// ImageOffset + ImageExtent, clamped component-wise to ImageSize.
uint3 GetDestBounds()
{
    const uint4 region_end = BufferImageCopies.ImageOffset + BufferImageCopies.ImageExtent;
    const uint4 clamped_end = min(region_end, BufferImageCopies.ImageSize);

    // Only the first three components are meaningful to callers.
    return clamped_end.xyz;
}
32
// Destination texel index for an image->image copy: the thread id offsets the
// xy position, while the third coordinate comes straight from ImageCopies.Dst.z.
uint3 GetImageCopyDst(uint3 dispatch_thread_id)
{
    uint3 texel = ImageCopies.Dst.xyz;
    texel.xy += dispatch_thread_id.xy;
    return texel;
}
37
// Source texel index for an image->image copy: the thread id offsets the
// xy position, while the third coordinate comes straight from ImageCopies.Src.z.
uint3 GetImageCopySrc(uint3 dispatch_thread_id)
{
    uint3 texel = ImageCopies.Src.xyz;
    texel.xy += dispatch_thread_id.xy;
    return texel;
}
42
// Destination texel index for a buffer->image copy: ImageOffset.xy shifted by
// the thread id, with ImageOffset.z used unchanged as the third coordinate.
uint3 GetImageDst(uint3 dispatch_thread_id)
{
    uint3 texel = BufferImageCopies.ImageOffset.xyz;
    texel.xy += dispatch_thread_id.xy;
    return texel;
}
47
// Source texel index for an image->buffer copy: ImageOffset.xy shifted by
// the thread id, with ImageOffset.z used unchanged as the third coordinate.
uint3 GetImageSrc(uint3 dispatch_thread_id)
{
    uint3 texel = BufferImageCopies.ImageOffset.xyz;
    texel.xy += dispatch_thread_id.xy;
    return texel;
}
52
// Shared byte-address computation for the buffer side of a buffer<->image copy.
//   bytes_per_thread: bytes covered by one thread along X.
//   bytes_per_texel:  size of one texel; scales the row length into a row pitch.
// The effective row length is the larger of BufferVars.y and ImageExtent.x.
uint GetBufferByteOffset(uint3 dispatch_thread_id, uint bytes_per_thread, uint bytes_per_texel)
{
    const uint row_length = max(BufferImageCopies.BufferVars.y, BufferImageCopies.ImageExtent.x);

    return BufferImageCopies.BufferVars.x
        + dispatch_thread_id.x * bytes_per_thread
        + dispatch_thread_id.y * bytes_per_texel * row_length;
}

// 128-bit texels: one 16-byte texel per thread.
uint GetBufferDst128(uint3 dispatch_thread_id)
{
    return GetBufferByteOffset(dispatch_thread_id, 16, 16);
}
uint GetBufferSrc128(uint3 dispatch_thread_id)
{
    return GetBufferByteOffset(dispatch_thread_id, 16, 16);
}

// 64-bit texels: one 8-byte texel per thread.
uint GetBufferDst64(uint3 dispatch_thread_id)
{
    return GetBufferByteOffset(dispatch_thread_id, 8, 8);
}
uint GetBufferSrc64(uint3 dispatch_thread_id)
{
    return GetBufferByteOffset(dispatch_thread_id, 8, 8);
}

// 32-bit texels: one 4-byte texel per thread.
uint GetBufferDst32(uint3 dispatch_thread_id)
{
    return GetBufferByteOffset(dispatch_thread_id, 4, 4);
}
uint GetBufferSrc32(uint3 dispatch_thread_id)
{
    return GetBufferByteOffset(dispatch_thread_id, 4, 4);
}

// 16-bit texels: each thread still covers one 4-byte word (two texels),
// but rows advance by 2 bytes per texel.
uint GetBufferDst16(uint3 dispatch_thread_id)
{
    return GetBufferByteOffset(dispatch_thread_id, 4, 2);
}
uint GetBufferSrc16(uint3 dispatch_thread_id)
{
    return GetBufferByteOffset(dispatch_thread_id, 4, 2);
}

// 8-bit texels: each thread covers one 4-byte word (four texels),
// and rows advance by 1 byte per texel.
uint GetBufferDst8(uint3 dispatch_thread_id)
{
    return GetBufferByteOffset(dispatch_thread_id, 4, 1);
}
uint GetBufferSrc8(uint3 dispatch_thread_id)
{
    return GetBufferByteOffset(dispatch_thread_id, 4, 1);
}
97
98
// Splits a 32-bit word into its four bytes; x receives the least significant byte.
uint4 Uint32ToUint8x4(uint data)
{
    const uint4 shifted = uint4(data, data >> 8, data >> 16, data >> 24);
    return shifted & 0xFF;
}
103
// Splits a 32-bit word into two 16-bit halves; x receives the low half.
uint2 Uint32ToUint16x2(uint data)
{
    const uint2 shifted = uint2(data, data >> 16);
    return shifted & 0xFFFF;
}
108
// Packs four values into one 32-bit word, x in the least significant byte.
// Each component is clamped to 0xFF first, so the byte lanes never overlap
// and OR-ing the shifted lanes equals summing them.
uint Uint8x4ToUint32(uint4 data)
{
    const uint4 bytes = min(data, 0xFF);
    return bytes.x | (bytes.y << 8) | (bytes.z << 16) | (bytes.w << 24);
}
113
// Packs two values into one 32-bit word, x in the low 16 bits.
// Each component is clamped to 0xFFFF first, so the two halves never overlap.
uint Uint16x2ToUint32(uint2 data)
{
    const uint2 halves = min(data, 0xFFFF);
    return halves.x | (halves.y << 16);
}
118
// Splits the low 16 bits of `data` into two bytes; x receives the low byte.
uint2 Uint16ToUint8x2(uint data)
{
    const uint2 shifted = uint2(data, data >> 8);
    return shifted & 0xFF;
}
123
// Packs two values into a 16-bit quantity, x in the low byte.
// Each component is clamped to 0xFF first, so the two bytes never overlap.
uint Uint8x2ToUint16(uint2 data)
{
    const uint2 bytes = min(data, 0xFF);
    return bytes.x | (bytes.y << 8);
}
128
// Converts four floats to integers by scaling with 255 and rounding to nearest
// (adding 0.5 before the truncating conversion).
uint4 Float4ToUint8x4(float4 data)
{
    const float4 scaled = data * 255;
    return uint4(scaled + .5f);
}
133
// Buffers are always R32-aligned
// Raw buffer views: read-only source at t0, writable destination at u0.
ByteAddressBuffer   BufferCopySrc : register(t0);
RWByteAddressBuffer BufferCopyDst : register(u0);

// Integer image views. The source shares slot t0 with BufferCopySrc, and the
// three destination views share slot u0 with BufferCopyDst and with each other;
// each kernel references only the ones matching its copy direction and format.
Texture2DArray<uint4>   ImageCopySrc     : register(t0);
RWTexture2DArray<uint>  ImageCopyDstR    : register(u0);
RWTexture2DArray<uint2> ImageCopyDstRg   : register(u0);
RWTexture2DArray<uint4> ImageCopyDstRgba : register(u0);

// Float view of the source image, used by the B8G8R8A8 image->buffer kernel.
Texture2DArray<float4>  ImageCopySrcBgra : register(t0);
144
145// Image<->Image copies
// R8G8 -> R16: packs the source red/green bytes of one texel into a single
// 16-bit value (red in the low byte). One thread handles one texel.
[numthreads(1, 1, 1)]
void cs_copy_image2d_r8g8_image2d_r16(uint3 dispatch_thread_id : SV_DispatchThreadID)
{
    const uint2 rg = ImageCopySrc[GetImageCopySrc(dispatch_thread_id)].xy;

    ImageCopyDstR[GetImageCopyDst(dispatch_thread_id)] = Uint8x2ToUint16(rg);
}
154
// R16 -> R8G8: splits the source red channel of one texel into two bytes
// (low byte to red, high byte to green). One thread handles one texel.
[numthreads(1, 1, 1)]
void cs_copy_image2d_r16_image2d_r8g8(uint3 dispatch_thread_id : SV_DispatchThreadID)
{
    const uint r16 = ImageCopySrc[GetImageCopySrc(dispatch_thread_id)].x;

    ImageCopyDstRg[GetImageCopyDst(dispatch_thread_id)] = Uint16ToUint8x2(r16);
}
163
// R8G8B8A8 -> R32: packs the four source bytes of one texel into a single
// 32-bit value (red in the least significant byte). One thread per texel.
[numthreads(1, 1, 1)]
void cs_copy_image2d_r8g8b8a8_image2d_r32(uint3 dispatch_thread_id : SV_DispatchThreadID)
{
    const uint4 rgba = ImageCopySrc[GetImageCopySrc(dispatch_thread_id)];

    ImageCopyDstR[GetImageCopyDst(dispatch_thread_id)] = Uint8x4ToUint32(rgba);
}
172
// R8G8B8A8 -> R16G16: packs the four source bytes into a 32-bit word, then
// splits that word into two 16-bit halves. One thread per texel.
[numthreads(1, 1, 1)]
void cs_copy_image2d_r8g8b8a8_image2d_r16g16(uint3 dispatch_thread_id : SV_DispatchThreadID)
{
    const uint4 rgba = ImageCopySrc[GetImageCopySrc(dispatch_thread_id)];
    const uint packed = Uint8x4ToUint32(rgba);

    ImageCopyDstRg[GetImageCopyDst(dispatch_thread_id)] = Uint32ToUint16x2(packed);
}
181
// R16G16 -> R32: packs the source red/green 16-bit values of one texel into a
// single 32-bit value (red in the low half). One thread per texel.
[numthreads(1, 1, 1)]
void cs_copy_image2d_r16g16_image2d_r32(uint3 dispatch_thread_id : SV_DispatchThreadID)
{
    const uint2 rg = ImageCopySrc[GetImageCopySrc(dispatch_thread_id)].xy;

    ImageCopyDstR[GetImageCopyDst(dispatch_thread_id)] = Uint16x2ToUint32(rg);
}
190
// R16G16 -> R8G8B8A8: packs the source red/green 16-bit values into a 32-bit
// word, then splits that word into four bytes. One thread per texel.
[numthreads(1, 1, 1)]
void cs_copy_image2d_r16g16_image2d_r8g8b8a8(uint3 dispatch_thread_id : SV_DispatchThreadID)
{
    const uint2 rg = ImageCopySrc[GetImageCopySrc(dispatch_thread_id)].xy;
    const uint packed = Uint16x2ToUint32(rg);

    ImageCopyDstRgba[GetImageCopyDst(dispatch_thread_id)] = Uint32ToUint8x4(packed);
}
199
// R32 -> R16G16: splits the source red channel of one texel into two 16-bit
// halves (low half to red). One thread per texel.
[numthreads(1, 1, 1)]
void cs_copy_image2d_r32_image2d_r16g16(uint3 dispatch_thread_id : SV_DispatchThreadID)
{
    const uint r32 = ImageCopySrc[GetImageCopySrc(dispatch_thread_id)].x;

    ImageCopyDstRg[GetImageCopyDst(dispatch_thread_id)] = Uint32ToUint16x2(r32);
}
208
// R32 -> R8G8B8A8: splits the source red channel of one texel into four bytes
// (least significant byte to red). One thread per texel.
[numthreads(1, 1, 1)]
void cs_copy_image2d_r32_image2d_r8g8b8a8(uint3 dispatch_thread_id : SV_DispatchThreadID)
{
    const uint r32 = ImageCopySrc[GetImageCopySrc(dispatch_thread_id)].x;

    ImageCopyDstRgba[GetImageCopyDst(dispatch_thread_id)] = Uint32ToUint8x4(r32);
}
217
// Thread-group width (X) and height (Y) used by the buffer<->image copy kernels.
#define COPY_NUM_THREAD_X 8
#define COPY_NUM_THREAD_Y 8
220
221// Buffer<->Image copies
222
223// R32G32B32A32
// Buffer -> R32G32B32A32 image: each thread copies four consecutive dwords
// from the buffer into one texel. Threads outside the copy region do nothing.
[numthreads(COPY_NUM_THREAD_X, COPY_NUM_THREAD_Y, 1)]
void cs_copy_buffer_image2d_r32g32b32a32(uint3 dispatch_thread_id : SV_DispatchThreadID) {
    const uint3 texel = GetImageDst(dispatch_thread_id);
    const uint3 limit = GetDestBounds();

    if (all(texel.xy < limit.xy)) {
        const uint byte_offset = GetBufferSrc128(dispatch_thread_id);

        // Load4 reads the same four dwords as four Load calls at +0/+4/+8/+12.
        ImageCopyDstRgba[texel] = BufferCopySrc.Load4(byte_offset);
    }
}
241
// R32G32B32A32 image -> buffer: each thread writes one texel as four
// consecutive dwords. Threads outside the copy region do nothing.
[numthreads(COPY_NUM_THREAD_X, COPY_NUM_THREAD_Y, 1)]
void cs_copy_image2d_r32g32b32a32_buffer(uint3 dispatch_thread_id : SV_DispatchThreadID) {
    const uint3 texel = GetImageSrc(dispatch_thread_id);
    const uint3 limit = GetDestBounds();

    if (all(texel.xy < limit.xy)) {
        const uint byte_offset = GetBufferDst128(dispatch_thread_id);

        // Store4 writes x, y, z, w to +0/+4/+8/+12, same as four Store calls.
        BufferCopyDst.Store4(byte_offset, ImageCopySrc[texel]);
    }
}
258
259// R32G32
// Buffer -> R32G32 image: each thread copies two consecutive dwords from the
// buffer into one texel. Threads outside the copy region do nothing.
[numthreads(COPY_NUM_THREAD_X, COPY_NUM_THREAD_Y, 1)]
void cs_copy_buffer_image2d_r32g32(uint3 dispatch_thread_id : SV_DispatchThreadID) {
    const uint3 texel = GetImageDst(dispatch_thread_id);
    const uint3 limit = GetDestBounds();

    if (all(texel.xy < limit.xy)) {
        const uint byte_offset = GetBufferSrc64(dispatch_thread_id);

        // Load2 reads the dwords at +0 and +4.
        ImageCopyDstRg[texel] = BufferCopySrc.Load2(byte_offset);
    }
}
275
// R32G32 image -> buffer: each thread writes the red/green channels of one
// texel as two consecutive dwords. Threads outside the copy region do nothing.
[numthreads(COPY_NUM_THREAD_X, COPY_NUM_THREAD_Y, 1)]
void cs_copy_image2d_r32g32_buffer(uint3 dispatch_thread_id : SV_DispatchThreadID) {
    const uint3 texel = GetImageSrc(dispatch_thread_id);
    const uint3 limit = GetDestBounds();

    if (all(texel.xy < limit.xy)) {
        const uint byte_offset = GetBufferDst64(dispatch_thread_id);

        // Store2 writes red to +0 and green to +4.
        BufferCopyDst.Store2(byte_offset, ImageCopySrc[texel].rg);
    }
}
290
291// R16G16B16A16
// Buffer -> R16G16B16A16 image: each thread reads two dwords and unpacks each
// into two 16-bit channels (first dword -> red/green, second -> blue/alpha).
// Threads outside the copy region do nothing.
[numthreads(COPY_NUM_THREAD_X, COPY_NUM_THREAD_Y, 1)]
void cs_copy_buffer_image2d_r16g16b16a16(uint3 dispatch_thread_id : SV_DispatchThreadID) {
    const uint3 texel = GetImageDst(dispatch_thread_id);
    const uint3 limit = GetDestBounds();

    if (all(texel.xy < limit.xy)) {
        const uint byte_offset = GetBufferSrc64(dispatch_thread_id);
        const uint2 words = BufferCopySrc.Load2(byte_offset);

        const uint2 rg = Uint32ToUint16x2(words.x);
        const uint2 ba = Uint32ToUint16x2(words.y);
        ImageCopyDstRgba[texel] = uint4(rg, ba);
    }
}
307
// R16G16B16A16 image -> buffer: each thread packs one texel into two dwords
// (red/green into the first, blue/alpha into the second).
// Threads outside the copy region do nothing.
[numthreads(COPY_NUM_THREAD_X, COPY_NUM_THREAD_Y, 1)]
void cs_copy_image2d_r16g16b16a16_buffer(uint3 dispatch_thread_id : SV_DispatchThreadID) {
    const uint3 texel = GetImageSrc(dispatch_thread_id);
    const uint3 limit = GetDestBounds();

    if (all(texel.xy < limit.xy)) {
        const uint4 rgba = ImageCopySrc[texel];
        const uint byte_offset = GetBufferDst64(dispatch_thread_id);

        const uint2 words = uint2(Uint16x2ToUint32(rgba.xy), Uint16x2ToUint32(rgba.zw));
        BufferCopyDst.Store2(byte_offset, words);
    }
}
322
323// R32
// Buffer -> R32 image: each thread copies one dword into one texel.
// Threads outside the copy region do nothing.
[numthreads(COPY_NUM_THREAD_X, COPY_NUM_THREAD_Y, 1)]
void cs_copy_buffer_image2d_r32(uint3 dispatch_thread_id : SV_DispatchThreadID) {
    const uint3 texel = GetImageDst(dispatch_thread_id);
    const uint3 limit = GetDestBounds();

    if (all(texel.xy < limit.xy)) {
        const uint byte_offset = GetBufferSrc32(dispatch_thread_id);
        ImageCopyDstR[texel] = BufferCopySrc.Load(byte_offset);
    }
}
336
// R32 image -> buffer: each thread writes the red channel of one texel as one
// dword. Threads outside the copy region do nothing.
[numthreads(COPY_NUM_THREAD_X, COPY_NUM_THREAD_Y, 1)]
void cs_copy_image2d_r32_buffer(uint3 dispatch_thread_id : SV_DispatchThreadID) {
    const uint3 texel = GetImageSrc(dispatch_thread_id);
    const uint3 limit = GetDestBounds();

    if (all(texel.xy < limit.xy)) {
        const uint byte_offset = GetBufferDst32(dispatch_thread_id);
        BufferCopyDst.Store(byte_offset, ImageCopySrc[texel].r);
    }
}
349
350// R16G16
// Buffer -> R16G16 image: each thread reads one dword and unpacks it into the
// red (low half) and green (high half) channels of one texel.
// Threads outside the copy region do nothing.
[numthreads(COPY_NUM_THREAD_X, COPY_NUM_THREAD_Y, 1)]
void cs_copy_buffer_image2d_r16g16(uint3 dispatch_thread_id : SV_DispatchThreadID) {
    const uint3 texel = GetImageDst(dispatch_thread_id);
    const uint3 limit = GetDestBounds();

    if (all(texel.xy < limit.xy)) {
        const uint word = BufferCopySrc.Load(GetBufferSrc32(dispatch_thread_id));
        ImageCopyDstRg[texel] = Uint32ToUint16x2(word);
    }
}
363
// R16G16 image -> buffer: each thread packs the red/green channels of one
// texel into one dword (red in the low half).
// Threads outside the copy region do nothing.
[numthreads(COPY_NUM_THREAD_X, COPY_NUM_THREAD_Y, 1)]
void cs_copy_image2d_r16g16_buffer(uint3 dispatch_thread_id : SV_DispatchThreadID) {
    const uint3 texel = GetImageSrc(dispatch_thread_id);
    const uint3 limit = GetDestBounds();

    if (all(texel.xy < limit.xy)) {
        const uint byte_offset = GetBufferDst32(dispatch_thread_id);
        const uint2 rg = ImageCopySrc[texel].xy;

        BufferCopyDst.Store(byte_offset, Uint16x2ToUint32(rg));
    }
}
376
377// R8G8B8A8
// Buffer -> R8G8B8A8 image: each thread reads one dword and unpacks its four
// bytes into one texel (least significant byte to red).
// Threads outside the copy region do nothing.
[numthreads(COPY_NUM_THREAD_X, COPY_NUM_THREAD_Y, 1)]
void cs_copy_buffer_image2d_r8g8b8a8(uint3 dispatch_thread_id : SV_DispatchThreadID) {
    const uint3 texel = GetImageDst(dispatch_thread_id);
    const uint3 limit = GetDestBounds();

    if (all(texel.xy < limit.xy)) {
        const uint word = BufferCopySrc.Load(GetBufferSrc32(dispatch_thread_id));
        ImageCopyDstRgba[texel] = Uint32ToUint8x4(word);
    }
}
390
// R8G8B8A8 image -> buffer: each thread packs the four channels of one texel
// into one dword (red in the least significant byte).
// Threads outside the copy region do nothing.
[numthreads(COPY_NUM_THREAD_X, COPY_NUM_THREAD_Y, 1)]
void cs_copy_image2d_r8g8b8a8_buffer(uint3 dispatch_thread_id : SV_DispatchThreadID) {
    const uint3 texel = GetImageSrc(dispatch_thread_id);
    const uint3 limit = GetDestBounds();

    if (all(texel.xy < limit.xy)) {
        const uint byte_offset = GetBufferDst32(dispatch_thread_id);
        const uint4 rgba = ImageCopySrc[texel];

        BufferCopyDst.Store(byte_offset, Uint8x4ToUint32(rgba));
    }
}
403
404// B8G8R8A8
// B8G8R8A8 image -> buffer: reads one texel through the float view, swaps the
// red and blue channels (.bgra), converts each channel to an 8-bit integer and
// packs the result into one dword. Threads outside the copy region do nothing.
[numthreads(COPY_NUM_THREAD_X, COPY_NUM_THREAD_Y, 1)]
void cs_copy_image2d_b8g8r8a8_buffer(uint3 dispatch_thread_id : SV_DispatchThreadID) {
    const uint3 texel = GetImageSrc(dispatch_thread_id);
    const uint3 limit = GetDestBounds();

    if (all(texel.xy < limit.xy)) {
        const uint byte_offset = GetBufferDst32(dispatch_thread_id);

        const float4 swizzled = ImageCopySrcBgra[texel].bgra;
        const uint4 bytes = Float4ToUint8x4(swizzled);

        BufferCopyDst.Store(byte_offset, Uint8x4ToUint32(bytes));
    }
}
417
418// R16
// Buffer -> R16 image: each thread reads one dword and writes it as two
// horizontally adjacent texels (low half first), so the thread's X index is
// doubled to get the first texel. Threads whose first texel lies outside the
// copy region do nothing.
[numthreads(COPY_NUM_THREAD_X, COPY_NUM_THREAD_Y, 1)]
void cs_copy_buffer_image2d_r16(uint3 dispatch_thread_id : SV_DispatchThreadID) {
    uint3 dst_idx = GetImageDst(uint3(2, 1, 0) * dispatch_thread_id);
    uint3 bounds = GetDestBounds();
    if (dst_idx.x >= bounds.x || dst_idx.y >= bounds.y) {
        return;
    }

    uint src_idx = GetBufferSrc16(dispatch_thread_id);
    uint2 data = Uint32ToUint16x2(BufferCopySrc.Load(src_idx));

    ImageCopyDstR[dst_idx] = data.x;

    // The second texel of the pair must be bounds-checked on its own: when the
    // region width is odd it falls past the region and must not be overwritten.
    if (dst_idx.x + 1 < bounds.x) {
        ImageCopyDstR[dst_idx + uint3(1, 0, 0)] = data.y;
    }
}
433
// R16 image -> buffer: each thread reads two horizontally adjacent texels and
// packs them into one dword (first texel in the low half), so the thread's X
// index is doubled to get the first texel. Threads whose first texel lies
// outside the copy region do nothing.
// NOTE(review): the second texel is read without its own bounds check, so for
// an odd region width the final dword of a row carries a texel from outside
// the region — confirm whether callers guarantee even widths.
[numthreads(COPY_NUM_THREAD_X, COPY_NUM_THREAD_Y, 1)]
void cs_copy_image2d_r16_buffer(uint3 dispatch_thread_id : SV_DispatchThreadID) {
    const uint3 first_texel = GetImageSrc(uint3(2, 1, 0) * dispatch_thread_id);
    const uint3 limit = GetDestBounds();

    if (all(first_texel.xy < limit.xy)) {
        const uint byte_offset = GetBufferDst16(dispatch_thread_id);

        const uint2 pair = uint2(
            ImageCopySrc[first_texel].r,
            ImageCopySrc[first_texel + uint3(1, 0, 0)].r
        );

        BufferCopyDst.Store(byte_offset, Uint16x2ToUint32(pair));
    }
}
449
450// R8G8
// Buffer -> R8G8 image: each thread reads one dword and writes it as two
// horizontally adjacent texels (bytes 0-1 to the first, bytes 2-3 to the
// second), so the thread's X index is doubled to get the first texel. Threads
// whose first texel lies outside the copy region do nothing.
[numthreads(COPY_NUM_THREAD_X, COPY_NUM_THREAD_Y, 1)]
void cs_copy_buffer_image2d_r8g8(uint3 dispatch_thread_id : SV_DispatchThreadID) {
    uint3 dst_idx = GetImageDst(uint3(2, 1, 0) * dispatch_thread_id);
    uint3 bounds = GetDestBounds();
    if (dst_idx.x >= bounds.x || dst_idx.y >= bounds.y) {
        return;
    }

    uint src_idx = GetBufferSrc16(dispatch_thread_id);

    uint4 data = Uint32ToUint8x4(BufferCopySrc.Load(src_idx));

    ImageCopyDstRg[dst_idx] = data.xy;

    // The second texel of the pair must be bounds-checked on its own: when the
    // region width is odd it falls past the region and must not be overwritten.
    if (dst_idx.x + 1 < bounds.x) {
        ImageCopyDstRg[dst_idx + uint3(1, 0, 0)] = data.zw;
    }
}
466
// R8G8 image -> buffer: each thread reads two horizontally adjacent texels and
// packs their red/green bytes into one dword (first texel in bytes 0-1), so
// the thread's X index is doubled to get the first texel. Threads whose first
// texel lies outside the copy region do nothing.
// NOTE(review): the second texel is read without its own bounds check, so for
// an odd region width the final dword of a row carries a texel from outside
// the region — confirm whether callers guarantee even widths.
[numthreads(COPY_NUM_THREAD_X, COPY_NUM_THREAD_Y, 1)]
void cs_copy_image2d_r8g8_buffer(uint3 dispatch_thread_id : SV_DispatchThreadID) {
    const uint3 first_texel = GetImageSrc(uint3(2, 1, 0) * dispatch_thread_id);
    const uint3 limit = GetDestBounds();

    if (all(first_texel.xy < limit.xy)) {
        const uint byte_offset = GetBufferDst16(dispatch_thread_id);

        const uint2 first_rg = ImageCopySrc[first_texel].xy;
        const uint2 second_rg = ImageCopySrc[first_texel + uint3(1, 0, 0)].xy;

        BufferCopyDst.Store(byte_offset, Uint8x4ToUint32(uint4(first_rg, second_rg)));
    }
}
482
483// R8
// Buffer -> R8 image: each thread reads one dword and writes its four bytes as
// four horizontally adjacent texels (least significant byte first), so the
// thread's X index is multiplied by four to get the first texel. Threads whose
// first texel lies outside the copy region do nothing.
[numthreads(COPY_NUM_THREAD_X, COPY_NUM_THREAD_Y, 1)]
void cs_copy_buffer_image2d_r8(uint3 dispatch_thread_id : SV_DispatchThreadID) {
    uint3 dst_idx = GetImageDst(uint3(4, 1, 0) * dispatch_thread_id);
    uint3 bounds = GetDestBounds();
    if (dst_idx.x >= bounds.x || dst_idx.y >= bounds.y) {
        return;
    }

    uint src_idx = GetBufferSrc8(dispatch_thread_id);
    uint4 data = Uint32ToUint8x4(BufferCopySrc.Load(src_idx));

    ImageCopyDstR[dst_idx] = data.x;

    // Texels 1..3 of the group must each be bounds-checked: when the region
    // width is not a multiple of four they fall past the region and must not
    // be overwritten.
    if (dst_idx.x + 1 < bounds.x) {
        ImageCopyDstR[dst_idx + uint3(1, 0, 0)] = data.y;
    }
    if (dst_idx.x + 2 < bounds.x) {
        ImageCopyDstR[dst_idx + uint3(2, 0, 0)] = data.z;
    }
    if (dst_idx.x + 3 < bounds.x) {
        ImageCopyDstR[dst_idx + uint3(3, 0, 0)] = data.w;
    }
}
500
// R8 image -> buffer: each thread reads four horizontally adjacent texels and
// packs their red bytes into one dword (first texel in the least significant
// byte), so the thread's X index is multiplied by four to get the first texel.
// Threads whose first texel lies outside the copy region do nothing.
// NOTE(review): texels 1..3 are read without their own bounds checks, so for a
// region width that is not a multiple of four the final dword of a row carries
// texels from outside the region — confirm whether callers guarantee this.
[numthreads(COPY_NUM_THREAD_X, COPY_NUM_THREAD_Y, 1)]
void cs_copy_image2d_r8_buffer(uint3 dispatch_thread_id : SV_DispatchThreadID) {
    const uint3 first_texel = GetImageSrc(uint3(4, 1, 0) * dispatch_thread_id);
    const uint3 limit = GetDestBounds();

    if (all(first_texel.xy < limit.xy)) {
        const uint byte_offset = GetBufferDst8(dispatch_thread_id);

        uint4 reds;
        reds.x = ImageCopySrc[first_texel].r;
        reds.y = ImageCopySrc[first_texel + uint3(1, 0, 0)].r;
        reds.z = ImageCopySrc[first_texel + uint3(2, 0, 0)].r;
        reds.w = ImageCopySrc[first_texel + uint3(3, 0, 0)].r;

        BufferCopyDst.Store(byte_offset, Uint8x4ToUint32(reds));
    }
}
518