1 /*==============================================================================
2 Copyright(c) 2017 Intel Corporation
3 
4 Permission is hereby granted, free of charge, to any person obtaining a
5 copy of this software and associated documentation files(the "Software"),
6 to deal in the Software without restriction, including without limitation
7 the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 and / or sell copies of the Software, and to permit persons to whom the
9 Software is furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included
12 in all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 OTHER DEALINGS IN THE SOFTWARE.
21 ============================================================================*/
22 // clang-format off
23 // CpuSwizzleBlt.c - Surface swizzling definitions and BLT functionality.
24 
25 // [!] File serves as its own header:
26 //      #define INCLUDE_CpuSwizzleBlt_c_AS_HEADER
27 //      #include "CpuSwizzleBlt.c"
28 
29 #define SUB_ELEMENT_SUPPORT         // Support for Partial Element Transfer (e.g. separating/merging depth-stencil).
30 #define INTEL_TILE_W_SUPPORT        // Stencil Only;
31 
32 #ifndef CpuSwizzleBlt_INCLUDED
33 
34 #ifdef __cplusplus
35 extern "C" {
36 #endif
37 
38 // Background ##################################################################
39 
40 /* Pixel-based surfaces commonly stored in memory row-by-row. This convention
41 has simple "y * Pitch + x" addressing but has spatial locality only in
42 horizontal direction--i.e. horizontal pixel neighbors stored next to each other
43 but vertical neighbors stored entire pitch away.
44 
45 Since many graphics operations involve multi-dimensional data access, to
46 improve cache/memory access performance it is often more beneficial to use
47 alternative storage conventions which have multi-dimensional spatial locality--
48 i.e. where pixels tend to be stored near both their horizontal and vertical
49 neighbors.
50 
51 "Tiling/Swizzling" is storage convention that increases multi-dimensional
52 spatial locality by treating surface as series of smaller regions/"tiles",
53 laid out in row-major order across surface, with entire content of each tile
54 stored contiguously. Data within each tile is stored in pattern that further
55 maximizes the locality. */
56 
57 
58 // Swizzle Descriptors #########################################################
59 
60 /* Tile sizes always powers of 2 and chosen to be architecturally convenient--
61 e.g. 4KB to match physical page size. Tile dimensions also powers of 2, usually
62 chosen to produce square tiles for targeted pixel size--e.g. 4KB = 128 bytes x
63 32 rows = 32 x 32 pixels @ 4 bytes-per-pixel.
64 
65 Since tile size and dimensions all powers of two, the spatial-to-linear mapping
66 required to store a tile can be trivial: spatial indexing bits can simply be
67 mapped to linear offset bits--e.g. for a 4KB, 128x32 tile...each byte within
68 tile can be referenced with a 7-bit X index and 5-bit Y index--and each of
69 those 12 index bits can be individually mapped to a bit in the 12-bit offset of
70 the tile's linear storage.
71 
72 The order in which spatial index bits are mapped to linear offset bits
73 determines the spatial locality properties of the surface data. E.g. the
74 following mapping...
75 
76     Linear[11:0] = Y4 Y3 Y2 Y1 Y0 X6 X5 X4 X3 X2 X1 X0
77                    \-- Y[4:0] --/ \----- X[6:0] -----/
78 
79 ...stores bytes of tile in row-major order, with horizontal neighbors stored
80 contiguously and vertical neighbors stored 128 bytes away. If instead, Y index
81 bits were mapped to the low-order...
82 
83     Linear[11:0] = X6 X5 X4 X3 X2 X1 X0 Y4 Y3 Y2 Y1 Y0
84                    \----- X[6:0] -----/ \-- Y[4:0] --/
85 
86 ...bytes of tile would be stored in column-major order, with vertical neighbors
87 stored contiguously and horizontal neighbors stored 32 bytes away.
88 
89 Individual X and Y bits can be separated and interspersed in mapping to
90 increase locality via sub-tiling--e.g...
91 
92     Linear[11:0] = Y4 Y3 Y2 X6 X5 X4 Y1 Y0 X3 X2 X1 X0
93                                      \-- Sub-Tile ---/
94 
...subdivides tile into 16x4 sub-tiles laid out in row-major order across tile,
96 with sub-tile content further stored in row-major order, with horizontal byte
97 neighbors within sub-tile stored contiguously and vertical neighbors only 16
98 bytes away. This means single 64-byte cache line contains 4x4 group of 32bpp
99 pixels--which is powerful spatial locality for graphics processing.
100 
101 If mappings restricted to being "parallel" for index bits (i.e. bits of given
102 index can change position but not relative order during mapping), then bit
103 indexes need not be explicitly denoted--e.g. the previous sub-tiling mapping
104 can be represented as...
105 
106     Linear[11:0] = Y Y Y X X X Y Y X X X X
107 
108 ...where X and Y index bits are implied to be zero-based-counted in order they
109 are encountered.
110 
111 In software, spatial-to-linear mapping conveniently described with bit mask for
112 each dimension, where a set bit indicates the next bit of that dimension's
113 index is mapped to that position in the linear offset--e.g....
114 
115     Linear[11:0] = Y Y Y X X X Y Y X X X X
116     MaskX =        0 0 0 1 1 1 0 0 1 1 1 1
117     MaskY =        1 1 1 0 0 0 1 1 0 0 0 0
118 
119 Such dimensional masks all that's needed to describe given tiling/swizzling
120 convention, since tile size and dimensions can be derived from the masks:
121 
122     TileWidth =  2 ^ NumberOfSetBits(MaskX)
123     TileHeight = 2 ^ NumberOfSetBits(MaskY)
124     TileSize =   2 ^ NumberOfSetBits(MaskX OR MaskY)
125 
126 Tiling/swizzling is not limited to 2D. With addition of another tile dimension,
127 spatial locality for 3D or MSAA sample neighbors can be controlled, also. */
128 
129     typedef struct  _SWIZZLE_DESCRIPTOR {
130         struct          _SWIZZLE_DESCRIPTOR_MASKS {
131             int             x, y, z;
132         }               Mask;
133     }               SWIZZLE_DESCRIPTOR;
134 
135     // Definition Helper Macros...
136     #define X ,'x'
137     #define Y ,'y'
138     #define Z ,'z'
139     #define S ,'z' // S = MSAA Sample Index
140     #define o ,0   // o = N/A Swizzle Bit
141     #ifdef INCLUDE_CpuSwizzleBlt_c_AS_HEADER
142         #define __SWIZZLE(Name, b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0) \
143             extern const SWIZZLE_DESCRIPTOR Name;
144     #else // C Compile...
145         #define __SWIZZLE(Name, b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0) \
146             const SWIZZLE_DESCRIPTOR Name = \
147                 { (b15 == 'x' ? 0x8000 : 0) + (b14 == 'x' ? 0x4000 : 0) + (b13 == 'x' ? 0x2000 : 0) + (b12 == 'x' ? 0x1000 : 0) + (b11 == 'x' ? 0x0800 : 0) + (b10 == 'x' ? 0x0400 : 0) + (b9 == 'x' ? 0x0200 : 0) + (b8 == 'x' ? 0x0100 : 0) + (b7 == 'x' ? 0x0080 : 0) + (b6 == 'x' ? 0x0040 : 0) + (b5 == 'x' ? 0x0020 : 0) + (b4 == 'x' ? 0x0010 : 0) + (b3 == 'x' ? 0x0008 : 0) + (b2 == 'x' ? 0x0004 : 0) + (b1 == 'x' ? 0x0002 : 0) + (b0 == 'x' ? 0x0001 : 0), \
148                   (b15 == 'y' ? 0x8000 : 0) + (b14 == 'y' ? 0x4000 : 0) + (b13 == 'y' ? 0x2000 : 0) + (b12 == 'y' ? 0x1000 : 0) + (b11 == 'y' ? 0x0800 : 0) + (b10 == 'y' ? 0x0400 : 0) + (b9 == 'y' ? 0x0200 : 0) + (b8 == 'y' ? 0x0100 : 0) + (b7 == 'y' ? 0x0080 : 0) + (b6 == 'y' ? 0x0040 : 0) + (b5 == 'y' ? 0x0020 : 0) + (b4 == 'y' ? 0x0010 : 0) + (b3 == 'y' ? 0x0008 : 0) + (b2 == 'y' ? 0x0004 : 0) + (b1 == 'y' ? 0x0002 : 0) + (b0 == 'y' ? 0x0001 : 0), \
149                   (b15 == 'z' ? 0x8000 : 0) + (b14 == 'z' ? 0x4000 : 0) + (b13 == 'z' ? 0x2000 : 0) + (b12 == 'z' ? 0x1000 : 0) + (b11 == 'z' ? 0x0800 : 0) + (b10 == 'z' ? 0x0400 : 0) + (b9 == 'z' ? 0x0200 : 0) + (b8 == 'z' ? 0x0100 : 0) + (b7 == 'z' ? 0x0080 : 0) + (b6 == 'z' ? 0x0040 : 0) + (b5 == 'z' ? 0x0020 : 0) + (b4 == 'z' ? 0x0010 : 0) + (b3 == 'z' ? 0x0008 : 0) + (b2 == 'z' ? 0x0004 : 0) + (b1 == 'z' ? 0x0002 : 0) + (b0 == 'z' ? 0x0001 : 0) }
150 #endif
151     #define SWIZZLE(__SWIZZLE_Args) __SWIZZLE __SWIZZLE_Args
152 
153     // Legacy Intel Tiling Swizzles...
154     SWIZZLE(( INTEL_TILE_X              o o o o Y Y Y X X X X X X X X X ));
155     SWIZZLE(( INTEL_TILE_Y              o o o o X X X Y Y Y Y Y X X X X ));
156 
157     #ifdef INTEL_TILE_W_SUPPORT
158         SWIZZLE(( INTEL_TILE_W          o o o o X X X Y Y Y Y X Y X Y X ));
159     #endif
160 // Gen9 Swizzles...
161     SWIZZLE(( INTEL_TILE_YF_128         o o o o X Y X Y X X Y Y X X X X ));
162     SWIZZLE(( INTEL_TILE_YF_64          o o o o X Y X Y X X Y Y X X X X ));
163     SWIZZLE(( INTEL_TILE_YF_32          o o o o X Y X Y X Y Y Y X X X X ));
164     SWIZZLE(( INTEL_TILE_YF_16          o o o o X Y X Y X Y Y Y X X X X ));
165     SWIZZLE(( INTEL_TILE_YF_8           o o o o X Y X Y Y Y Y Y X X X X ));
166 
167     SWIZZLE(( INTEL_TILE_YS_128         X Y X Y X Y X Y X X Y Y X X X X ));
168     SWIZZLE(( INTEL_TILE_YS_64          X Y X Y X Y X Y X X Y Y X X X X ));
169     SWIZZLE(( INTEL_TILE_YS_32          X Y X Y X Y X Y X Y Y Y X X X X ));
170     SWIZZLE(( INTEL_TILE_YS_16          X Y X Y X Y X Y X Y Y Y X X X X ));
171     SWIZZLE(( INTEL_TILE_YS_8           X Y X Y X Y X Y Y Y Y Y X X X X ));
172 
173     SWIZZLE(( INTEL_TILE_YF_MSAA2_128   o o o o S Y X Y X X Y Y X X X X ));
174     SWIZZLE(( INTEL_TILE_YF_MSAA2_64    o o o o S Y X Y X X Y Y X X X X ));
175     SWIZZLE(( INTEL_TILE_YF_MSAA2_32    o o o o S Y X Y X Y Y Y X X X X ));
176     SWIZZLE(( INTEL_TILE_YF_MSAA2_16    o o o o S Y X Y X Y Y Y X X X X ));
177     SWIZZLE(( INTEL_TILE_YF_MSAA2_8     o o o o S Y X Y Y Y Y Y X X X X ));
178 
179     SWIZZLE(( INTEL_TILE_YS_MSAA2_128   S Y X Y X Y X Y X X Y Y X X X X ));
180     SWIZZLE(( INTEL_TILE_YS_MSAA2_64    S Y X Y X Y X Y X X Y Y X X X X ));
181     SWIZZLE(( INTEL_TILE_YS_MSAA2_32    S Y X Y X Y X Y X Y Y Y X X X X ));
182     SWIZZLE(( INTEL_TILE_YS_MSAA2_16    S Y X Y X Y X Y X Y Y Y X X X X ));
183     SWIZZLE(( INTEL_TILE_YS_MSAA2_8     S Y X Y X Y X Y Y Y Y Y X X X X ));
184 
185     SWIZZLE(( INTEL_TILE_YF_MSAA4_128   o o o o S S X Y X X Y Y X X X X ));
186     SWIZZLE(( INTEL_TILE_YF_MSAA4_64    o o o o S S X Y X X Y Y X X X X ));
187     SWIZZLE(( INTEL_TILE_YF_MSAA4_32    o o o o S S X Y X Y Y Y X X X X ));
188     SWIZZLE(( INTEL_TILE_YF_MSAA4_16    o o o o S S X Y X Y Y Y X X X X ));
189     SWIZZLE(( INTEL_TILE_YF_MSAA4_8     o o o o S S X Y Y Y Y Y X X X X ));
190 
191     SWIZZLE(( INTEL_TILE_YS_MSAA4_128   S S X Y X Y X Y X X Y Y X X X X ));
192     SWIZZLE(( INTEL_TILE_YS_MSAA4_64    S S X Y X Y X Y X X Y Y X X X X ));
193     SWIZZLE(( INTEL_TILE_YS_MSAA4_32    S S X Y X Y X Y X Y Y Y X X X X ));
194     SWIZZLE(( INTEL_TILE_YS_MSAA4_16    S S X Y X Y X Y X Y Y Y X X X X ));
195     SWIZZLE(( INTEL_TILE_YS_MSAA4_8     S S X Y X Y X Y Y Y Y Y X X X X ));
196 
197     SWIZZLE(( INTEL_TILE_YF_MSAA8_128   o o o o S S S Y X X Y Y X X X X ));
198     SWIZZLE(( INTEL_TILE_YF_MSAA8_64    o o o o S S S Y X X Y Y X X X X ));
199     SWIZZLE(( INTEL_TILE_YF_MSAA8_32    o o o o S S S Y X Y Y Y X X X X ));
200     SWIZZLE(( INTEL_TILE_YF_MSAA8_16    o o o o S S S Y X Y Y Y X X X X ));
201     SWIZZLE(( INTEL_TILE_YF_MSAA8_8     o o o o S S S Y Y Y Y Y X X X X ));
202 
203     SWIZZLE(( INTEL_TILE_YS_MSAA8_128   S S S Y X Y X Y X X Y Y X X X X ));
204     SWIZZLE(( INTEL_TILE_YS_MSAA8_64    S S S Y X Y X Y X X Y Y X X X X ));
205     SWIZZLE(( INTEL_TILE_YS_MSAA8_32    S S S Y X Y X Y X Y Y Y X X X X ));
206     SWIZZLE(( INTEL_TILE_YS_MSAA8_16    S S S Y X Y X Y X Y Y Y X X X X ));
207     SWIZZLE(( INTEL_TILE_YS_MSAA8_8     S S S Y X Y X Y Y Y Y Y X X X X ));
208 
209     SWIZZLE(( INTEL_TILE_YF_MSAA16_128  o o o o S S S S X X Y Y X X X X ));
210     SWIZZLE(( INTEL_TILE_YF_MSAA16_64   o o o o S S S S X X Y Y X X X X ));
211     SWIZZLE(( INTEL_TILE_YF_MSAA16_32   o o o o S S S S X Y Y Y X X X X ));
212     SWIZZLE(( INTEL_TILE_YF_MSAA16_16   o o o o S S S S X Y Y Y X X X X ));
213     SWIZZLE(( INTEL_TILE_YF_MSAA16_8    o o o o S S S S Y Y Y Y X X X X ));
214 
215     SWIZZLE(( INTEL_TILE_YS_MSAA16_128  S S S S X Y X Y X X Y Y X X X X ));
216     SWIZZLE(( INTEL_TILE_YS_MSAA16_64   S S S S X Y X Y X X Y Y X X X X ));
217     SWIZZLE(( INTEL_TILE_YS_MSAA16_32   S S S S X Y X Y X Y Y Y X X X X ));
218     SWIZZLE(( INTEL_TILE_YS_MSAA16_16   S S S S X Y X Y X Y Y Y X X X X ));
219     SWIZZLE(( INTEL_TILE_YS_MSAA16_8    S S S S X Y X Y Y Y Y Y X X X X ));
220 
221     SWIZZLE(( INTEL_TILE_YF_3D_128      o o o o Y Z X X Z Z Y Y X X X X ));
222     SWIZZLE(( INTEL_TILE_YF_3D_64       o o o o Y Z X X Z Z Y Y X X X X ));
223     SWIZZLE(( INTEL_TILE_YF_3D_32       o o o o Y Z X Y Z Z Y Y X X X X ));
224     SWIZZLE(( INTEL_TILE_YF_3D_16       o o o o Y Z Y Z Z Z Y Y X X X X ));
225     SWIZZLE(( INTEL_TILE_YF_3D_8        o o o o Y Z Y Z Z Z Y Y X X X X ));
226 
227     SWIZZLE(( INTEL_TILE_YS_3D_128      X Y Z X Y Z X X Z Z Y Y X X X X ));
228     SWIZZLE(( INTEL_TILE_YS_3D_64       X Y Z X Y Z X X Z Z Y Y X X X X ));
229     SWIZZLE(( INTEL_TILE_YS_3D_32       X Y Z X Y Z X Y Z Z Y Y X X X X ));
230     SWIZZLE(( INTEL_TILE_YS_3D_16       X Y Z X Y Z Y Z Z Z Y Y X X X X ));
231     SWIZZLE(( INTEL_TILE_YS_3D_8        X Y Z X Y Z Y Z Z Z Y Y X X X X ));
232 
233     // XE_HP_SDV Swizzles...
234     SWIZZLE(( INTEL_TILE_4              o o o o Y Y X Y X X Y Y X X X X ));
235 
236     SWIZZLE(( INTEL_TILE_64_128         Y X X X Y Y X Y X X Y Y X X X X ));
237     SWIZZLE(( INTEL_TILE_64_64          Y X X X Y Y X Y X X Y Y X X X X ));
238     SWIZZLE(( INTEL_TILE_64_32          Y Y X X Y Y X Y X X Y Y X X X X ));
239     SWIZZLE(( INTEL_TILE_64_16          Y Y X X Y Y X Y X X Y Y X X X X ));
240     SWIZZLE(( INTEL_TILE_64_8           Y Y Y X Y Y X Y X X Y Y X X X X ));
241 
242     SWIZZLE(( INTEL_TILE_64_MSAA2_128   Y X X X Y Y X Y S X Y Y X X X X ));
243     SWIZZLE(( INTEL_TILE_64_MSAA2_64    Y X X X Y Y X Y S X Y Y X X X X ));
244     SWIZZLE(( INTEL_TILE_64_MSAA2_32    Y Y X X Y Y X Y S X Y Y X X X X ));
245     SWIZZLE(( INTEL_TILE_64_MSAA2_16    Y Y X X Y Y X Y S X Y Y X X X X ));
246     SWIZZLE(( INTEL_TILE_64_MSAA2_8     Y Y Y X Y Y X Y S X Y Y X X X X ));
247 
248     SWIZZLE(( INTEL_TILE_64_MSAA_128    Y X X X Y Y X S S X Y Y X X X X ));
249     SWIZZLE(( INTEL_TILE_64_MSAA_64     Y X X X Y Y X S S X Y Y X X X X ));
250     SWIZZLE(( INTEL_TILE_64_MSAA_32     Y Y X X Y Y X S S X Y Y X X X X ));
251     SWIZZLE(( INTEL_TILE_64_MSAA_16     Y Y X X Y Y X S S X Y Y X X X X ));
252     SWIZZLE(( INTEL_TILE_64_MSAA_8      Y Y Y X Y Y X S S X Y Y X X X X ));
253 
254     SWIZZLE(( INTEL_TILE_64_3D_128      Z Z Y X X X Z Y Z X Y Y X X X X ));
255     SWIZZLE(( INTEL_TILE_64_3D_64       Z Z Y X X X Z Y Z X Y Y X X X X ));
256     SWIZZLE(( INTEL_TILE_64_3D_32       Z Z Y X Y X Z Y Z X Y Y X X X X ));
257     SWIZZLE(( INTEL_TILE_64_3D_16       Z Z Z Y Y X Z Y Z X Y Y X X X X ));
258     SWIZZLE(( INTEL_TILE_64_3D_8        Z Z Z X Y Y Z Y Z X Y Y X X X X ));
259 
260     #undef X
261     #undef Y
262     #undef Z
263     #undef S
264     #undef o
265     #undef __SWIZZLE
266     #undef SWIZZLE
267 
268 // Accessing Swizzled Surface ##################################################
269 
270 /* While graphics hardware prefers to access surfaces stored in tiled/swizzled
271 formats, logically accessing such surfaces with CPU-based software is non-
272 trivial when high throughput is goal.
273 
274 This file implements (1) SwizzleOffset function to compute swizzled offset of
275 dimensionally-specified surface byte, and (2) CpuSwizzleBlt function to BLT
276 between linear ("y * pitch + x") and swizzled surfaces--with goal of providing
277 high-performance, swizzling BLT implementation to be used both in production
278 and as a guide for those seeking to understand swizzled access or implement
279 functionality beyond the simple BLT. */
280 
281 // Surface Descriptor for CpuSwizzleBlt function...
282 typedef struct _CPU_SWIZZLE_BLT_SURFACE
283 {
284     void                        *pBase;         // Pointer to surface base.
285     int                         Pitch, Height;  // Row-pitch in bytes, and height, of surface.
286     const SWIZZLE_DESCRIPTOR    *pSwizzle;      // Pointer to surface's swizzle descriptor, or NULL if unswizzled.
287     int                         OffsetX;        // Horizontal offset into surface for BLT rectangle, in bytes.
288     int                         OffsetY;        // Vertical offset into surface for BLT rectangle, in physical/pitch rows.
289     int                         OffsetZ;        // Zero if N/A, or 3D offset into surface for BLT rectangle, in 3D slices or MSAA samples as appropriate.
290 
291     #ifdef SUB_ELEMENT_SUPPORT
292         struct _CPU_SWIZZLE_BLT_SURFACE_ELEMENT
293         {
294             int                     Pitch, Size; // Zero if full-pixel BLT, or pitch and size, in bytes, of pixel element being BLT'ed.
295         }                       Element;
296 
297         /* e.g. to BLT only stencil data from S8D24 surface to S8 surface...
298             Dest.Element.Size = Src.Element.Size = sizeof(S8) = 1;
299             Dest.Element.Pitch = sizeof(S8) = 1;
300             Src.Element.Pitch = sizeof(S8D24) = 4;
301             Src.OffsetX += BYTE_OFFSET_OF_S8_WITHIN_S8D24; */
302     #endif
303 } CPU_SWIZZLE_BLT_SURFACE;
304 
305 extern int SwizzleOffset(const SWIZZLE_DESCRIPTOR *pSwizzle, int Pitch, int OffsetX, int OffsetY, int OffsetZ);
306 extern void CpuSwizzleBlt(CPU_SWIZZLE_BLT_SURFACE *pDest, CPU_SWIZZLE_BLT_SURFACE *pSrc, int CopyWidthBytes, int CopyHeight);
307 
308 #ifdef __cplusplus
309 }
310 #endif
311 
312 #define CpuSwizzleBlt_INCLUDED
313 
314 #endif
315 
316 
317 #ifndef INCLUDE_CpuSwizzleBlt_c_AS_HEADER
318 
319 //#define MINIMALIST                // Use minimalist, unoptimized implementation.
320 
321 #include "assert.h" // Quoted to allow local-directory override.
322 
323 #if(_MSC_VER >= 1400)
324     #include <intrin.h>
325 #elif((defined __clang__) ||(__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 5)))
326     #include <cpuid.h>
327     #include <x86intrin.h>
328 #else
329     #error "Unexpected compiler!"
330 #endif
331 
332 
// POPCNT: Count Lit Bits...
// PopCnt4[n] is the number of set bits in the 4-bit value n. The table is
// read-only, hence const.                   0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
static const unsigned char PopCnt4[16] =    {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
// Set-bit count of the low 4 bits of x.
#define POPCNT4(x)  (PopCnt4[(x) & 0xf])
// Set-bit count of the low 16 bits of x, one nibble at a time (x is evaluated
// four times, so pass a side-effect-free expression).
#define POPCNT16(x) (POPCNT4((x) >> 12) + POPCNT4((x) >> 8) + POPCNT4((x) >> 4) + POPCNT4(x))
337 
338 
SwizzleOffset(const SWIZZLE_DESCRIPTOR * pSwizzle,int Pitch,int OffsetX,int OffsetY,int OffsetZ)339 int SwizzleOffset( // ##########################################################
340 
341     /* Return swizzled offset of dimensionally-specified surface byte. */
342 
343     const SWIZZLE_DESCRIPTOR    *pSwizzle,  // Pointer to applicable swizzle descriptor.
344     int                         Pitch,      // Pointer to applicable surface row-pitch.
345     int                         OffsetX,    // Horizontal offset into surface of the target byte, in bytes.
346     int                         OffsetY,    // Vertical offset into surface of the target byte, in physical/pitch rows.
347     int                         OffsetZ)    // Zero if N/A, or 3D offset into surface of the target byte, in 3D slices or MSAA samples as appropriate.
348 
349     /* Given logically-specified (x, y, z) byte within swizzled surface,
350     function returns byte's linear/memory offset from surface's base--i.e. it
351     performs the swizzled, spatial-to-linear mapping.
352 
353     Function makes no real effort to perform optimally, since should only used
354     outside loops in CpuSwizzleBlt and similar functions. If any of this
355     functionality was needed in performance path, a custom implementation
356     should be used that limits itself to functionality specifically needed
357     (probably single-dimension, intra-tile offsets) and uses a fast computation
358     (e.g. LUT's, hard-codings, PDEP). */
359 
360 { // ###########################################################################
361 
362     char PDepSupported = -1; // AVX2/BMI2 PDEP (Parallel Deposit) Instruction
363 
364     int SwizzledOffset; // Return value being computed.
365 
366     int TileWidthBits =  POPCNT16(pSwizzle->Mask.x); // Log2(Tile Width in Bytes)
367     int TileHeightBits = POPCNT16(pSwizzle->Mask.y); // Log2(Tile Height)
368     int TileDepthBits =  POPCNT16(pSwizzle->Mask.z); // Log2(Tile Depth or MSAA Samples)
369     int TileSizeBits =   TileWidthBits + TileHeightBits + TileDepthBits; // Log2(Tile Size in Bytes)
370     int TilesPerRow =    Pitch >> TileWidthBits;     // Surface Width in Tiles
371 
372     int Row, Col;   // Tile grid position on surface, of tile containing specified byte.
373     int x, y, z;    // Position of specified byte within tile that contains it.
374 
375     if(PDepSupported == -1)
376     {
377         #if(_MSC_VER >= 1700)
378             #define PDEP(Src, Mask) _pdep_u32((Src), (Mask))
379             int CpuInfo[4];
380             __cpuidex(CpuInfo, 7, 0);
381             PDepSupported = ((CpuInfo[1] & (1 << 8)) != 0); // EBX[8] = BMI2
382         #elif ( defined (__BMI2__ ))
383             #define PDEP(Src, Mask) _pdep_u32((Src), (Mask))
384             unsigned int eax, ebx, ecx, edx;
385             __cpuid_count(7, 0, eax, ebx, ecx, edx);
386             PDepSupported = ((ebx & (1 << 8)) != 0); // EBX[8] = BMI2
387         #else
388             #define PDEP(Src, Mask) 0
389             PDepSupported = 0;
390         #endif
391     }
392 
393     assert( // Mutually Exclusive Swizzle Positions...
394         (pSwizzle->Mask.x | pSwizzle->Mask.y | pSwizzle->Mask.z) ==
395         (pSwizzle->Mask.x + pSwizzle->Mask.y + pSwizzle->Mask.z));
396 
397     assert( // Swizzle Limited to 16-bit (else expand POPCNT'ing)...
398         (pSwizzle->Mask.x | pSwizzle->Mask.y | pSwizzle->Mask.z) < (1 << 16));
399 
400     assert( // Pitch is Multiple of Tile Width...
401         Pitch == ((Pitch >> TileWidthBits) << TileWidthBits));
402 
403     { // Break Positioning into Tile-Granular and Intra-Tile Components...
404         assert((OffsetZ >>       TileDepthBits) == 0); // When dealing with 3D tiling, treat as separate single-tile-deep planes.
405         z =     OffsetZ & ((1 << TileDepthBits) - 1);
406 
407         Row =   OffsetY >>       TileHeightBits;
408         y =     OffsetY & ((1 << TileHeightBits) - 1);
409 
410         Col =   OffsetX >>       TileWidthBits;
411         x =     OffsetX & ((1 << TileWidthBits) - 1);
412     }
413 
414     SwizzledOffset = // Start with surface offset of given tile...
415         (Row * TilesPerRow + Col) << TileSizeBits; // <-- Tiles laid across surface in row-major order.
416 
417     // ...then OR swizzled offset of byte within tile...
418     if(PDepSupported)
419     {
420         SwizzledOffset +=
421             PDEP(x, pSwizzle->Mask.x) +
422             PDEP(y, pSwizzle->Mask.y) +
423             PDEP(z, pSwizzle->Mask.z);
424     }
425     else // PDEP workalike...
426     {
427         int bitIndex = 0, bitMask = 1;
428         int terminationMask = pSwizzle->Mask.x | pSwizzle->Mask.y | pSwizzle->Mask.z;
429         while(bitMask < terminationMask)
430         {
431             int MaskQ;
432             #define PROCESS(Q) {                    \
433                 MaskQ = bitMask & pSwizzle->Mask.Q; \
434                 SwizzledOffset += Q & MaskQ;        \
435                 Q <<= 1 ^ (MaskQ >> bitIndex);      \
436             }
437             PROCESS(x);
438             PROCESS(y);
439             PROCESS(z);
440 
441             bitIndex++;
442             bitMask <<= 1;
443 
444             #undef PROCESS
445         }
446     }
447 
448     return(SwizzledOffset);
449 }
450 
451 
CpuSwizzleBlt(CPU_SWIZZLE_BLT_SURFACE * pDest,CPU_SWIZZLE_BLT_SURFACE * pSrc,int CopyWidthBytes,int CopyHeight)452 void CpuSwizzleBlt( // #########################################################
453 
454     /* Performs specified swizzling BLT between two given surfaces. */
455 
456     CPU_SWIZZLE_BLT_SURFACE *pDest,         // Pointer to destination surface descriptor.
457     CPU_SWIZZLE_BLT_SURFACE *pSrc,          // Pointer to source surface descriptor.
458     int                     CopyWidthBytes, // Width of BLT rectangle, in bytes.
459     int                     CopyHeight)     // Height of BLT rectangle, in physical/pitch rows.
460 
461     #ifdef SUB_ELEMENT_SUPPORT
462 
463         /* When copying between surfaces with different pixel pitches, specify
464         CopyWidthBytes in terms of unswizzled surface's element-pitches:
465 
466             CopyWidthBytes = CopyWidthPixels * pLinearSurface.Element.Pitch; */
467 
468     #endif
469 
470 { // ###########################################################################
471 
472     CPU_SWIZZLE_BLT_SURFACE *pLinearSurface, *pSwizzledSurface;
473     int LinearToSwizzled;
474 
475     { // One surface swizzled, the other unswizzled (aka "linear")...
476         assert((pDest->pSwizzle != NULL) ^ (pSrc->pSwizzle != NULL));
477 
478         LinearToSwizzled = !pSrc->pSwizzle;
479         if(LinearToSwizzled)
480         {
481             pSwizzledSurface =  pDest;
482             pLinearSurface =    pSrc;
483         }
484         else // Swizzled-to-Linear...
485         {
486             pSwizzledSurface =  pSrc;
487             pLinearSurface =    pDest;
488         }
489     }
490 
491     #ifdef SUB_ELEMENT_SUPPORT
492     {
493         assert( // Either both or neither specified...
494             (pDest->Element.Pitch != 0) == (pSrc->Element.Pitch != 0));
495 
496         assert( // Surfaces agree on transfer element size...
497             pDest->Element.Size == pSrc->Element.Size);
498 
499         assert( // Element pitch not specified without element size...
500             !(pDest->Element.Pitch && !pDest->Element.Size));
501 
502         assert( // Legit element sizes...
503             (pDest->Element.Size <= pDest->Element.Pitch) &&
504             (pSrc->Element.Size <= pSrc->Element.Pitch));
505 
506         assert( // Sub-element CopyWidthBytes in terms of LinearSurface pitch...
507             (pLinearSurface->Element.Pitch == 0) ||
508             ((CopyWidthBytes % pLinearSurface->Element.Pitch) == 0));
509     }
510     #endif
511 
512     { // No surface overrun...
513         int NoOverrun =
514             #ifdef SUB_ELEMENT_SUPPORT
515             (
516                 // Sub-element transfer...
517                 ((pLinearSurface->Element.Size != pLinearSurface->Element.Pitch) ||
518                     (pSwizzledSurface->Element.Size != pSwizzledSurface->Element.Pitch)) &&
519                 // No overrun...
520                 ((pLinearSurface->OffsetX + CopyWidthBytes) <=
521                     (pLinearSurface->Pitch +
522                      // CopyWidthBytes's inclusion of uncopied bytes...
523                      (pLinearSurface->Element.Pitch - pLinearSurface->Element.Size))) &&
524                 ((pLinearSurface->OffsetY + CopyHeight) <= pLinearSurface->Height) &&
525                 ((pSwizzledSurface->OffsetX +
526                     // Adjust CopyWidthBytes from being in terms of LinearSurface pitch...
527                     (CopyWidthBytes / pLinearSurface->Element.Pitch * pSwizzledSurface->Element.Pitch)
528                     ) <=
529                     (pSwizzledSurface->Pitch +
530                      // CopyWidthBytes's inclusion of uncopied bytes...
531                      (pSwizzledSurface->Element.Pitch - pSwizzledSurface->Element.Size))) &&
532                 ((pSwizzledSurface->OffsetY + CopyHeight) <= pSwizzledSurface->Height)
533             ) ||
534             #endif
535 
536             ((pDest->OffsetX + CopyWidthBytes) <= pDest->Pitch) &&
537             ((pDest->OffsetY + CopyHeight) <= pDest->Height) &&
538             ((pSrc->OffsetX + CopyWidthBytes) <= pSrc->Pitch) &&
539             ((pSrc->OffsetY + CopyHeight) <= pSrc->Height);
540 
541         assert(NoOverrun);
542     }
543 
544     { // No surface overlap...
545         char *pDest0 = (char *) pDest->pBase;
546         char *pDest1 = (char *) pDest->pBase + pDest->Pitch * CopyHeight;
547         char *pSrc0 =  (char *)  pSrc->pBase;
548         char *pSrc1 =  (char *)  pSrc->pBase +  pSrc->Pitch * CopyHeight;
549 
550         assert(!(
551             ((pDest0 >= pSrc0) && (pDest0 < pSrc1)) ||
552             ((pSrc0 >= pDest0) && (pSrc0 < pDest1))));
553     }
554 
555     {
556         /* BLT will have pointer in each surface between which data will be
557         copied from source to destination. Each pointer will be appropriately
558         incremented/positioned through its surface, as BLT rectangle is
559         traversed. */
560 
561         char *pLinearAddress, *pSwizzledAddress;
562 
563         // Convenient to track traversal in swizzled surface offsets...
564         int x0 = pSwizzledSurface->OffsetX;
565         int x1 = x0 + CopyWidthBytes;
566         int y0 = pSwizzledSurface->OffsetY;
567         int y1 = y0 + CopyHeight;
568         int x, y;
569 
570         // Start linear pointer at specified base...
571         pLinearAddress =
572             (char *) pLinearSurface->pBase +
573             pLinearSurface->OffsetY * pLinearSurface->Pitch +
574             pLinearSurface->OffsetX;
575 
576         #ifdef MINIMALIST // Simple implementation for functional understanding/testing/etc.
577         {
578             #ifdef SUB_ELEMENT_SUPPORT
579                 assert( // No Sub-Element Transfer...
580                     (pLinearSurface->Element.Size == pLinearSurface->Element.Pitch) &&
581                     (pSwizzledSurface->Element.Size == pSwizzledSurface->Element.Pitch));
582             #endif
583 
584             for(y = y0; y < y1; y++)
585             {
586                 for(x = x0; x < x1; x++)
587                 {
588                     pSwizzledAddress =
589                         (char *) pSwizzledSurface->pBase +
590                         SwizzleOffset(
591                             pSwizzledSurface->pSwizzle,
592                             pSwizzledSurface->Pitch,
593                             x, y, pSwizzledSurface->OffsetZ);
594 
595                     if(LinearToSwizzled)
596                     {
597                         *pSwizzledAddress = *pLinearAddress;
598                     }
599                     else
600                     {
601                         *pLinearAddress = *pSwizzledAddress;
602                     }
603 
604                     pLinearAddress++;
605                 }
606 
607                 pLinearAddress += pLinearSurface->Pitch - CopyWidthBytes;
608             }
609         }
610         #else // Production/Performance Implementation...
611         {
612             /* Key Performance Gains from...
613                 (1) Efficient Memory Transfers (Ordering + Instruction)
614                 (2) Minimizing Work in Inner Loops */
615 
616             #if(_MSC_VER >= 1600)
617                 #include <stdint.h>
618 
619                 #pragma warning(push)
620                 #pragma warning(disable:4127) // Constant Conditional Expressions
621 
622                 unsigned long LOW_BIT_Index;
623                 #define LOW_BIT(x)  (_BitScanForward(&LOW_BIT_Index, (x)), LOW_BIT_Index)
624 
625                 unsigned long HIGH_BIT_Index;
626                 #define HIGH_BIT(x) (_BitScanReverse(&HIGH_BIT_Index, (x)), HIGH_BIT_Index)
627             #elif(__GNUC__ >= 4)
628                 #include <stdint.h>
629 
630                 #define LOW_BIT(x)  __builtin_ctz(x)
631                 #define HIGH_BIT(x) ((sizeof(x) * CHAR_BIT - 1) - __builtin_clz(x))
632             #else
633                 #error "Unexpected compiler!"
634             #endif
635 
636             typedef struct ___m24
637             {
638                 uint8_t byte[3];
639             } __m24; // 24-bit/3-byte memory element.
640 
641             // Macros intended to compile to various types of "load register from memory" instructions...
642             #define MOVB_R(  Reg, Src) (*(uint8_t  *)&(Reg) = *(uint8_t  *)(Src))
643             #define MOVW_R(  Reg, Src) (*(uint16_t *)&(Reg) = *(uint16_t *)(Src))
644             #define MOV3_R(  Reg, Src) (*(__m24    *)&(Reg) = *(__m24 *)(Src))
645             #define MOVD_R(  Reg, Src) (*(uint32_t *)&(Reg) = *(uint32_t *)(Src))
646 
647             #define MOVQ_R(  Reg, Src) ((Reg) = _mm_loadl_epi64((__m128i *)(Src)))
648             #define MOVDQ_R( Reg, Src) ((Reg) = _mm_load_si128( (__m128i *)(Src)))
649             #define MOVDQU_R(Reg, Src) ((Reg) = _mm_loadu_si128((__m128i *)(Src)))
650 
651             // As above, but the other half: "store to memory from register"...
652             #define MOVB_M(    Dest, Reg)(*(uint8_t  *)(Dest) = *(uint8_t  *)&(Reg))
653             #define MOVW_M(    Dest, Reg)(*(uint16_t *)(Dest) = *(uint16_t *)&(Reg))
654             #define MOV3_M(    Dest, Reg)(*(__m24    *)(Dest) = *(__m24    *)&(Reg))
655             #define MOVD_M(    Dest, Reg)(*(uint32_t *)(Dest) = *(uint32_t *)&(Reg))
656 
657             #define MOVQ_M(    Dest, Reg)(_mm_storel_epi64((__m128i *)(Dest), (Reg)))
658             #define MOVDQ_M(   Dest, Reg)(_mm_store_si128( (__m128i *)(Dest), (Reg)))
659             #define MOVDQU_M(  Dest, Reg)(_mm_storeu_si128((__m128i *)(Dest), (Reg)))
660             #define MOVNTDQ_M( Dest, Reg)(_mm_stream_si128((__m128i *)(Dest), (Reg)))
661 
662 
663             #define MIN_CONTAINED_POW2_BELOW_CAP(x, Cap) (1 << LOW_BIT((1 << LOW_BIT(x)) | (1 << HIGH_BIT(Cap))))
664 
665             #define SWIZZLE_OFFSET(OffsetX, OffsetY, OffsetZ) \
666                 SwizzleOffset(pSwizzledSurface->pSwizzle, pSwizzledSurface->Pitch, OffsetX, OffsetY, OffsetZ)
667 
668             #define MAX_XFER_WIDTH  16  // See "Compute Transfer Dimensions".
669             #define MAX_XFER_HEIGHT 4   // "
670 
671             char StreamingLoadSupported = -1; // SSE4.1: MOVNTDQA
672 
673             int TileWidthBits = POPCNT16(pSwizzledSurface->pSwizzle->Mask.x);   // Log2(Tile Width in Bytes)
674             int TileHeightBits = POPCNT16(pSwizzledSurface->pSwizzle->Mask.y);  // Log2(Tile Height)
675             int TileDepthBits = POPCNT16(pSwizzledSurface->pSwizzle->Mask.z);   // Log2(Tile Depth or MSAA Samples)
676             int BytesPerRowOfTiles = pSwizzledSurface->Pitch << (TileDepthBits + TileHeightBits);
677 
678             struct { int LeftCrust, MainRun, RightCrust; } CopyWidth;
679             int MaskX[MAX_XFER_WIDTH + 1], MaskY[MAX_XFER_HEIGHT + 1];
680             int SwizzledOffsetX0, SwizzledOffsetY;
681             struct { int Width, Height; } SwizzleMaxXfer;
682 
683             char *pSwizzledAddressCopyBase =
684                 (char *) pSwizzledSurface->pBase +
685                 SWIZZLE_OFFSET(0, 0, pSwizzledSurface->OffsetZ);
686 
687             assert(sizeof(__m24) == 3);
688 
689             if(StreamingLoadSupported == -1)
690             {
691                 #if(_MSC_VER >= 1500)
692                     #define MOVNTDQA_R(Reg, Src) ((Reg) = _mm_stream_load_si128((__m128i *)(Src)))
693                     int CpuInfo[4];
694                     __cpuid(CpuInfo, 1);
695                     StreamingLoadSupported = ((CpuInfo[2] & (1 << 19)) != 0); // ECX[19] = SSE4.1
696                 #elif((defined __clang__) || (__GNUC__ > 4) || (__GNUC__ == 4) && (__GNUC_MINOR__ >= 5))
697                     #define MOVNTDQA_R(Reg, Src) ((Reg) = _mm_stream_load_si128((__m128i *)(Src)))
698                     unsigned int eax, ebx, ecx, edx;
699                     __cpuid(1, eax, ebx, ecx, edx);
700                     StreamingLoadSupported = ((ecx & (1 << 19)) != 0); // ECX[19] = SSE4.1
701                 #else
702                     #define MOVNTDQA_R(Reg, Src) ((Reg) = (Reg))
703                     StreamingLoadSupported = 0;
704                 #endif
705             }
706 
707             { // Compute Transfer Dimensions...
708 
709                 /* When transferring between linear and swizzled surfaces, we
710                 can't traverse linearly through memory of both since they have
711                 drastically different memory orderings--Moving linearly through
712                 one means bouncing around the other.
713 
714                 Moving linearly through linear surface is more programmatically
715                 convenient--especially when BLT rectangles not constrained to
716                 tile boundaries. But moving linearly through swizzled surface
717                 memory is often more performance-friendly--especially when that
718                 memory is CPU-mapped as WC (Write Combining), which is often
719                 the case for graphics memory.
720 
721                 Fortunately, we can avoid shortcomings of both extremes by
722                 using hybrid traversal: Traverse mostly linearly through linear
723                 surface, but have innermost loop transfer small 2D chunks sized
724                 to use critical runs of linearity in the swizzled memory.
725 
726                 The "critical runs of linearity" that we want to hit in the
727                 swizzled memory are aligned, cache-line-sized memory chunks. If
728                 we bounce around with finer granularity we'll incur penalties
729                 of partial WC buffer use (whether from WC memory use or non-
730                 temporal stores).
731 
732                 The size of 2D chunks with cache-line-sized linearity in
733                 swizzled memory is determined by swizzle mapping's low-order
734                 six bits (for 64-byte cache lines). Most swizzles use
735                 "Y Y X X X X" in their low-order bits, which means their cache
736                 lines store 16x4 chunks--So our implementation will use those
737                 dimensions as our target/maximum 2D transfer chunk. If we had
738                 any 8x8 (or taller) swizzles, we should add such support and
739                 increase our maximum chunk height. If we had any 32x2 swizzles,
740                 we should add such support and increase our maximum chunk width.
741 
742                 Our implementation only bothers optimizing for 2D transfer
743                 chunks stored in row-major order--i.e. those whose swizzle
744                 mapping bits have a series of X's in the low-order, followed by
745                 Y's in the higher-order. Where a swizzle mapping inflection
746                 from Y back to X occurs, contiguous row-ordering is lost, and
747                 we would use that smaller, row-ordered chunk size. */
748 
749                 int TargetMask;
750 
751                 // Narrow optimized transfer Width by looking for inflection from X's...
752                 SwizzleMaxXfer.Width = MAX_XFER_WIDTH;
753                 while(  (TargetMask = SwizzleMaxXfer.Width - 1) &&
754                         ((pSwizzledSurface->pSwizzle->Mask.x & TargetMask) != TargetMask))
755                 {
756                     SwizzleMaxXfer.Width >>= 1;
757                 }
758 
759                 // Narrow optimized transfer height by looking for inflection from Y's...
760                 SwizzleMaxXfer.Height = MAX_XFER_HEIGHT;
761 
762                 while(  (TargetMask = (SwizzleMaxXfer.Height - 1) * SwizzleMaxXfer.Width) &&
763                         ((pSwizzledSurface->pSwizzle->Mask.y & TargetMask) != TargetMask))
764                 {
765                     SwizzleMaxXfer.Height >>= 1;
766                 }
767             }
768 
769             { // Separate CopyWidthBytes into unaligned left/right "crust" and aligned "MainRun"...
770                 int MaxXferWidth = MIN_CONTAINED_POW2_BELOW_CAP(SwizzleMaxXfer.Width, CopyWidthBytes);
771 
772                 CopyWidth.LeftCrust = // i.e. "bytes to xfer-aligned boundary"
773                     (MaxXferWidth - x0) & (MaxXferWidth - 1); // Simplification of ((MaxXferWidth - (x0 % MaxXferWidth)) % MaxXferWidth)
774 
775                 CopyWidth.MainRun =
776                     (CopyWidthBytes - CopyWidth.LeftCrust) & ~(SwizzleMaxXfer.Width - 1); // MainRun is of SwizzleMaxXfer.Width's--not MaxXferWidth's.
777 
778                 CopyWidth.RightCrust = CopyWidthBytes - (CopyWidth.LeftCrust + CopyWidth.MainRun);
779 
780                 #ifdef SUB_ELEMENT_SUPPORT
781                 {
782                     // For partial-pixel transfers, there is no crust and MainRun is done pixel-by-pixel...
783                     if( (pLinearSurface->Element.Size != pLinearSurface->Element.Pitch) ||
784                         (pSwizzledSurface->Element.Size != pSwizzledSurface->Element.Pitch))
785                     {
786                         CopyWidth.LeftCrust = CopyWidth.RightCrust = 0;
787                         CopyWidth.MainRun = CopyWidthBytes;
788                     }
789                 }
790                 #endif
791             }
792 
793 
794             /* Unlike in MINIMALIST implementation, which fully computes
795             swizzled offset for each transfer element, we want to minimize work
796             done in our inner loops.
797 
798             One way we'll reduce work is to separate pSwizzledAddress into
799             dimensional components--e.g. so Y-swizzling doesn't have to be
800             recomputed in X-loop.
801 
802             But a more powerful way we'll reduce work is...Instead of linearly
803             incrementing spatial offsets and then converting to their swizzled
804             counterparts, we'll compute swizzled bases outside the loops and
805             keep them swizzled using swizzled incrementing inside the loops--
806             since swizzled incrementing can be much cheaper than repeatedly
807             swizzling spatial offsets.
808 
809             Intra-tile swizzled incrementing can be done by using the inverse
810             of a spatial component's swizzle mask to ripple-carry a +1 to and
811             across the bits of a currently swizzled value--e.g. with...
812 
813                 SwizzledOffsetY:   Y X Y X Y Y X X X X
814                          ~MaskY:   0 1 0 1 0 0 1 1 1 1
815                                  +                   1
816                                 -----------------------
817 
818             ...set low-order ~MaskY bits will always ripple-carry the
819             incrementing +1 to wherever Y0 happens to be, and wherever there is
820             an arithmetic carry out of one Y position, set ~MaskY bits will
821             carry it across any gaps to the next Y position.
822 
823             The above algorithm only works for adding one, but the mask used
824             can be modified to deliver the +1 to any bit location, so any power
825             of two increment can be achieved.
826 
827             After swizzled increment, residue from mask addition and undesired
828             carries outside targeted fields must be removed using the natural
829             mask--So the final intra-tile swizzled increment is...
830 
831                 SwizzledOffsetQ = (SwizzledOffsetQ + ~MaskQ + 1) & MaskQ
832                     ...where Q is the applicable X/Y/Z dimensional component.
833 
834                 Or since in two's complement, (~MaskQ + 1) = -MaskQ...
835 
836                 SwizzledOffsetQ = (SwizzledOffsetQ - MaskQ) & MaskQ
837 
838             Since tile sizes are powers of two and tiles laid out in row-major
839             order across surface, the above swizzled incrementing can
840             additionally be used for inter-tile incrementing of X component by
841             extending applicable mask to include offset bits beyond the tile--
842             so arithmetic carries out of intra-tile X component will ripple to
843             advance swizzled inter-tile X offset to next tile. Same is not true
844             of inter-tile Y incrementing since surface pitches not restricted
845             to powers of two. */
846 
847             { // Compute Mask[IncSize] for Needed Increment Values...
848                 int ExtendedMaskX = // Bits beyond the tile (so X incrementing can operate inter-tile)...
849                     ~(pSwizzledSurface->pSwizzle->Mask.x |
850                       pSwizzledSurface->pSwizzle->Mask.y |
851                       pSwizzledSurface->pSwizzle->Mask.z);
852 
853                 /* Subtraction below delivers natural mask for +1 increment,
854                 and appropriately altered mask to deliver +1 to higher bit
855                 positions for +2/4/8/etc. increments. */
856 
857                 for(x = SwizzleMaxXfer.Width; x >= 1; x >>= 1)
858                 {
859                     MaskX[x] = SWIZZLE_OFFSET((1 << TileWidthBits) - x, 0, 0) | ExtendedMaskX;
860                 }
861 
862                 for(y = SwizzleMaxXfer.Height; y >= 1; y >>= 1)
863                 {
864                     MaskY[y] = SWIZZLE_OFFSET(0, (1 << TileHeightBits) - y, 0);
865                 }
866             }
867 
868             { // Base Dimensional Swizzled Offsets...
869                 int IntraTileY = y0 & ((1 << TileHeightBits) - 1);
870                 int TileAlignedY = y0 - IntraTileY;
871 
872                 SwizzledOffsetY = SWIZZLE_OFFSET(0, IntraTileY, 0);
873 
874                 SwizzledOffsetX0 =
875                     SWIZZLE_OFFSET(
876                         x0,
877                         TileAlignedY, // <-- Since SwizzledOffsetX will include "bits beyond the tile".
878                         0);
879             }
880 
881             // BLT Loops ///////////////////////////////////////////////////////
882 
883             /* Traverse BLT rectangle, transferring small, optimally-aligned 2D
884             chunks, as appropriate for given swizzle format. Use swizzled
885             incrementing of dimensional swizzled components. */
886 
887             for(y = y0; y < y1; )
888             {
889                 char *pSwizzledAddressLine = pSwizzledAddressCopyBase + SwizzledOffsetY;
890                 int xferHeight =
891                     // Largest pow2 xfer height that alignment, MaxXfer, and lines left will permit...
892                     MIN_CONTAINED_POW2_BELOW_CAP(y | SwizzleMaxXfer.Height, y1 - y);
893                 int SwizzledOffsetX = SwizzledOffsetX0;
894 
895                 __m128i xmm[MAX_XFER_HEIGHT];
896                 char *pLinearAddressEnd;
897                 int _MaskX;
898 
899                 // XFER Macros /////////////////////////////////////////////////
900 
901                 /* We'll define "XFER" macro to contain BLT X-loop work.
902 
903                 In simple implementation, XFER would be WHILE loop that does
904                 SSE transfer and performs pointer and swizzled offset
905                 incrementing.
906 
907                 ...but we have multiple conditions to handle...
908                   - Transfer Direction (Linear <--> Swizzled)
909                   - Optimal 2D Transfer Chunk Size
910                   - Available/Desired CPU Transfer Instructions
911                   - Unaligned Crust
912 
913                 Don't want X-loop to have conditional logic to handle
914                 variations since would retard performance--but neither do we
915                 want messy multitude of slightly different, copy-pasted code
916                 paths. So instead, XFER macro will provide common code template
917                 allowing instantiation of multiple X-loop variations--i.e. XFER
918                 calls from conditional Y-loop code will expand into separate,
919                 conditional-free, "lean and mean" X-loops.
920 
921                 Some conditional logic remains in XFER chain--but only outside
922                 X-loop. The two IF statements that remain in X-loop (i.e. those
923                 in XFER_LOAD/STORE) expand to compile-time constant conditional
924                 expressions, so with optimizing compiler, no runtime-
925                 conditional code will be generated--i.e. constant conditionals
926                 will simply decide whether given instantiation has that code or
927                 not. */
928 
929                 #define XFER(XFER_Store, XFER_Load, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch, XFER_Crust) \
930                 {                                                                                                   \
931                          XFER_LINES(4, XFER_Store, XFER_Load, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch, XFER_Crust) \
932                     else XFER_LINES(2, XFER_Store, XFER_Load, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch, XFER_Crust) \
933                     else XFER_LINES(1, XFER_Store, XFER_Load, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch, XFER_Crust);\
934                 }
935 
936                 #define XFER_LINES(XFER_LINES_Lines, XFER_Store, XFER_Load, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch, XFER_Crust) \
937                     if(xferHeight == (XFER_LINES_Lines))    \
938                     {                                       \
939                         if(XFER_Crust)                      \
940                         {                                   \
941                             XFER_SPAN(MOVB_M, MOVB_R, CopyWidth.LeftCrust  & 1, 1, 1, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
942                             XFER_SPAN(MOVW_M, MOVW_R, CopyWidth.LeftCrust  & 2, 2, 2, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
943                             XFER_SPAN(MOVD_M, MOVD_R, CopyWidth.LeftCrust  & 4, 4, 4, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
944                             XFER_SPAN(MOVQ_M, MOVQ_R, CopyWidth.LeftCrust  & 8, 8, 8, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
945                         }                                   \
946                                                             \
947                         XFER_SPAN(XFER_Store, XFER_Load, CopyWidth.MainRun, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch);\
948                                                             \
949                         if(XFER_Crust)                      \
950                         {                                   \
951                             XFER_SPAN(MOVQ_M, MOVQ_R, CopyWidth.RightCrust & 8, 8, 8, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
952                             XFER_SPAN(MOVD_M, MOVD_R, CopyWidth.RightCrust & 4, 4, 4, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
953                             XFER_SPAN(MOVW_M, MOVW_R, CopyWidth.RightCrust & 2, 2, 2, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
954                             XFER_SPAN(MOVB_M, MOVB_R, CopyWidth.RightCrust & 1, 1, 1, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
955                         }                                   \
956                     }
957 
958                 #define XFER_SPAN(XFER_Store, XFER_Load, XFER_CopyWidthBytes, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_Height, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch) \
959                 {                                                                           \
960                     pLinearAddressEnd = pLinearAddress + (XFER_CopyWidthBytes);             \
961                     _MaskX = MaskX[XFER_Pitch_Swizzled];                                    \
962                     while(pLinearAddress < pLinearAddressEnd)                               \
963                     {                                                                       \
964                         pSwizzledAddress = pSwizzledAddressLine + SwizzledOffsetX;          \
965                                                                                             \
966                         XFER_LOAD(0, XFER_Load, XFER_pSrc, XFER_SrcPitch, XFER_Height);     \
967                         XFER_LOAD(1, XFER_Load, XFER_pSrc, XFER_SrcPitch, XFER_Height);     \
968                         XFER_LOAD(2, XFER_Load, XFER_pSrc, XFER_SrcPitch, XFER_Height);     \
969                         XFER_LOAD(3, XFER_Load, XFER_pSrc, XFER_SrcPitch, XFER_Height);     \
970                         XFER_STORE(0, XFER_Store, XFER_pDest, XFER_DestPitch, XFER_Height); \
971                         XFER_STORE(1, XFER_Store, XFER_pDest, XFER_DestPitch, XFER_Height); \
972                         XFER_STORE(2, XFER_Store, XFER_pDest, XFER_DestPitch, XFER_Height); \
973                         XFER_STORE(3, XFER_Store, XFER_pDest, XFER_DestPitch, XFER_Height); \
974                                                                                             \
975                         SwizzledOffsetX = (SwizzledOffsetX - _MaskX) & _MaskX;              \
976                         pLinearAddress += (XFER_Pitch_Linear);                              \
977                     }                                                                       \
978                 }
979 
980                 #define XFER_LOAD(XFER_Line, XFER_Load, XFER_pSrc, XFER_SrcPitch, XFER_Height) \
981                 {                                                           \
982                     if((XFER_Line) < (XFER_Height))                         \
983                     {                                                       \
984                         XFER_Load(                                          \
985                             xmm[XFER_Line],                                 \
986                             (XFER_pSrc) + (XFER_Line) * (XFER_SrcPitch));   \
987                     }                                                       \
988                 }
989 
990                 #define XFER_STORE(XFER_Line, XFER_Store, XFER_pDest, XFER_DestPitch, XFER_Height) \
991                 {                                                           \
992                     if((XFER_Line) < (XFER_Height))                         \
993                     {                                                       \
994                         XFER_Store(                                         \
995                             (XFER_pDest) + (XFER_Line) * (XFER_DestPitch),  \
996                             xmm[XFER_Line]);                                \
997                     }                                                       \
998                 }
999 
1000                 // Perform Applicable Transfer /////////////////////////////////
1001                 assert( // DQ Alignment...
1002                     ((intptr_t) pSwizzledSurface->pBase % 16 == 0) &&
1003                     (pSwizzledSurface->Pitch % 16 == 0));
1004 
1005                 #ifdef SUB_ELEMENT_SUPPORT
1006                     if( (pLinearSurface->Element.Size != pLinearSurface->Element.Pitch) ||
1007                         (pSwizzledSurface->Element.Size != pSwizzledSurface->Element.Pitch))
1008                     {
1009                         if(LinearToSwizzled)
1010                         {
1011                             switch(pLinearSurface->Element.Size)
1012                             {
1013                                 case 16: XFER(MOVNTDQ_M, MOVDQU_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, pLinearAddress, pLinearSurface->Pitch, 0); break;
1014                                 case  8: XFER(   MOVQ_M,   MOVQ_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, pLinearAddress, pLinearSurface->Pitch, 0); break;
1015                                 case  4: XFER(   MOVD_M,   MOVD_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, pLinearAddress, pLinearSurface->Pitch, 0); break;
1016                                 case  3: XFER(   MOV3_M,   MOV3_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, pLinearAddress, pLinearSurface->Pitch, 0); break;
1017                                 case  2: XFER(   MOVW_M,   MOVW_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, pLinearAddress, pLinearSurface->Pitch, 0); break;
1018                                 case  1: XFER(   MOVB_M,   MOVB_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, pLinearAddress, pLinearSurface->Pitch, 0); break;
1019                                 default: assert(0);
1020                             }
1021                         }
1022                         else
1023                         {
1024                             switch(pLinearSurface->Element.Size)
1025                             {
1026                                 case 16:
1027                                 {
1028                                     if(StreamingLoadSupported)
1029                                     {
1030                                         XFER(MOVDQU_M, MOVNTDQA_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0);
1031                                     }
1032                                     else
1033                                     {
1034                                         XFER(MOVDQU_M,    MOVDQ_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0);
1035                                     }
1036                                     break;
1037                                 }
1038                                 case  8: XFER(   MOVQ_M,   MOVQ_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0); break;
1039                                 case  4: XFER(   MOVD_M,   MOVD_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0); break;
1040                                 case  3: XFER(   MOV3_M,   MOV3_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0); break;
1041                                 case  2: XFER(   MOVW_M,   MOVW_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0); break;
1042                                 case  1: XFER(   MOVB_M,   MOVB_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0); break;
1043                                 default: assert(0);
1044                             }
1045                         }
1046                     } else
1047                 #endif // SUB_ELEMENT_SUPPORT
1048                 if(LinearToSwizzled)
1049                 {
1050                     switch(SwizzleMaxXfer.Width)
1051                     {
1052                         case 16: XFER(MOVNTDQ_M, MOVDQU_R, 16, 16, pSwizzledAddress, 16, pLinearAddress, pLinearSurface->Pitch, 1); break;
1053                         #ifdef INTEL_TILE_W_SUPPORT
1054                             case  2: XFER(MOVW_M,  MOVW_R,  2,  2, pSwizzledAddress,  2, pLinearAddress, pLinearSurface->Pitch, 1); break;
1055                         #endif
1056                         default: assert(0); // Unexpected cases excluded to save compile time/size of multiplying instantiations.
1057                     }
1058                 }
1059                 else
1060                 {
1061                     switch(SwizzleMaxXfer.Width)
1062                     {
1063                         case 16:
1064                         {
1065                             if(StreamingLoadSupported)
1066                             {
1067                                 XFER(MOVDQU_M, MOVNTDQA_R, 16, 16, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, 16, 1);
1068                             }
1069                             else
1070                             {
1071                                 XFER(MOVDQU_M,    MOVDQ_R, 16, 16, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, 16, 1);
1072                             }
1073                             break;
1074                         }
1075                         #ifdef INTEL_TILE_W_SUPPORT
1076                             case 2: XFER(MOVW_M,   MOVW_R,  2,  2, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress,  2, 1); break;
1077                         #endif
1078                         default: assert(0);
1079                     }
1080                 }
1081 
1082 
1083                 // Swizzled inc of SwizzledOffsetY...
1084                 SwizzledOffsetY = (SwizzledOffsetY - MaskY[xferHeight]) & MaskY[xferHeight];
1085                 if(!SwizzledOffsetY) SwizzledOffsetX0 += BytesPerRowOfTiles; // Wraps advance SwizzledOffsetX0, since that includes "bits beyond the tile".
1086 
1087                 y += xferHeight;
1088 
1089                 /* X-loop only advanced pLinearAddress by CopyWidthBytes--even
1090                 when transferred multiple lines. Advance rest of way: */
1091                 pLinearAddress += xferHeight * pLinearSurface->Pitch - CopyWidthBytes;
1092 
1093             } // foreach(y)
1094 
1095             _mm_sfence(); // Flush Non-Temporal Writes
1096 
1097             #if(_MSC_VER)
1098                 #pragma warning(pop)
1099             #endif
1100         }
1101         #endif
1102     }
1103 } // CpuSwizzleBlt
1104 
1105 #endif // #ifndef INCLUDE_CpuSwizzleBlt_c_AS_HEADER
1106 // clang-format on
1107