1 // Written in the D programming language.
2 
3 /**
4  * Builtin SIMD intrinsics
5  *
6  * Source: $(DRUNTIMESRC core/_simd.d)
7  *
8  * Copyright: Copyright Digital Mars 2012.
9  * License:   $(WEB www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
10  * Authors:   $(WEB digitalmars.com, Walter Bright),
11  */
12 
13 module core.simd;
14 
15 pure:
16 nothrow:
17 @safe:
18 @nogc:
19 
20 /*******************************
21  * Create a vector type.
22  *
23  * Parameters:
24  *      T = one of double[2], float[4], void[16], byte[16], ubyte[16],
25  *      short[8], ushort[8], int[4], uint[4], long[2], ulong[2].
26  *      For 256 bit vectors,
27  *      one of double[4], float[8], void[32], byte[32], ubyte[32],
28  *      short[16], ushort[16], int[8], uint[8], long[4], ulong[4]
29  */
30 
template Vector(T)
{
    /* The compiler implements __vector directly; this template only
     * wraps it so user code can write Vector!T.  T's that are not
     * valid vector types are rejected by the compiler itself.
     */
    alias Vector = __vector(T);
}
38 
/* Convenience names for the commonly used vector types.
 * Each alias is only declared when the target actually supports a
 * vector of that size and element type (64, 128 or 256 bit wide).
 */
static if (is(Vector!(void[8])))    alias void8   = Vector!(void[8]);       ///
static if (is(Vector!(double[1])))  alias double1 = Vector!(double[1]);     ///
static if (is(Vector!(float[2])))   alias float2  = Vector!(float[2]);      ///
static if (is(Vector!(byte[8])))    alias byte8   = Vector!(byte[8]);       ///
static if (is(Vector!(ubyte[8])))   alias ubyte8  = Vector!(ubyte[8]);      ///
static if (is(Vector!(short[4])))   alias short4  = Vector!(short[4]);      ///
static if (is(Vector!(ushort[4])))  alias ushort4 = Vector!(ushort[4]);     ///
static if (is(Vector!(int[2])))     alias int2    = Vector!(int[2]);        ///
static if (is(Vector!(uint[2])))    alias uint2   = Vector!(uint[2]);       ///
static if (is(Vector!(long[1])))    alias long1   = Vector!(long[1]);       ///
static if (is(Vector!(ulong[1])))   alias ulong1  = Vector!(ulong[1]);      ///

static if (is(Vector!(void[16])))   alias void16   = Vector!(void[16]);     ///
static if (is(Vector!(double[2])))  alias double2  = Vector!(double[2]);    ///
static if (is(Vector!(float[4])))   alias float4   = Vector!(float[4]);     ///
static if (is(Vector!(byte[16])))   alias byte16   = Vector!(byte[16]);     ///
static if (is(Vector!(ubyte[16])))  alias ubyte16  = Vector!(ubyte[16]);    ///
static if (is(Vector!(short[8])))   alias short8   = Vector!(short[8]);     ///
static if (is(Vector!(ushort[8])))  alias ushort8  = Vector!(ushort[8]);    ///
static if (is(Vector!(int[4])))     alias int4     = Vector!(int[4]);       ///
static if (is(Vector!(uint[4])))    alias uint4    = Vector!(uint[4]);      ///
static if (is(Vector!(long[2])))    alias long2    = Vector!(long[2]);      ///
static if (is(Vector!(ulong[2])))   alias ulong2   = Vector!(ulong[2]);     ///

static if (is(Vector!(void[32])))   alias void32   = Vector!(void[32]);     ///
static if (is(Vector!(double[4])))  alias double4  = Vector!(double[4]);    ///
static if (is(Vector!(float[8])))   alias float8   = Vector!(float[8]);     ///
static if (is(Vector!(byte[32])))   alias byte32   = Vector!(byte[32]);     ///
static if (is(Vector!(ubyte[32])))  alias ubyte32  = Vector!(ubyte[32]);    ///
static if (is(Vector!(short[16])))  alias short16  = Vector!(short[16]);    ///
static if (is(Vector!(ushort[16]))) alias ushort16 = Vector!(ushort[16]);   ///
static if (is(Vector!(int[8])))     alias int8     = Vector!(int[8]);       ///
static if (is(Vector!(uint[8])))    alias uint8    = Vector!(uint[8]);      ///
static if (is(Vector!(long[4])))    alias long4    = Vector!(long[4]);      ///
static if (is(Vector!(ulong[4])))   alias ulong4   = Vector!(ulong[4]);     ///
76 
version (D_SIMD)
{
  /** XMM opcodes that conform to the following:
   *
   *  opcode xmm1,xmm2/mem
   *
   * and do not have side effects (i.e. do not write to memory).
   *
   * Each enum value packs the instruction's mandatory prefix byte
   * (66, F2 or F3, if any), the 0F escape byte(s) and the opcode byte
   * into one integer, e.g. ADDSS = F3 0F 58.
   */
  enum XMM
  {
    ADDSS = 0xF30F58,
    ADDSD = 0xF20F58,
    ADDPS = 0x000F58,
    ADDPD = 0x660F58,
    PADDB = 0x660FFC,
    PADDW = 0x660FFD,
    PADDD = 0x660FFE,
    PADDQ = 0x660FD4,

    SUBSS = 0xF30F5C,
    SUBSD = 0xF20F5C,
    SUBPS = 0x000F5C,
    SUBPD = 0x660F5C,
    PSUBB = 0x660FF8,
    PSUBW = 0x660FF9,
    PSUBD = 0x660FFA,
    PSUBQ = 0x660FFB,

    MULSS = 0xF30F59,
    MULSD = 0xF20F59,
    MULPS = 0x000F59,
    MULPD = 0x660F59,
    PMULLW = 0x660FD5,

    DIVSS = 0xF30F5E,
    DIVSD = 0xF20F5E,
    DIVPS = 0x000F5E,
    DIVPD = 0x660F5E,

    PAND  = 0x660FDB,
    POR   = 0x660FEB,

    UCOMISS = 0x000F2E,
    UCOMISD = 0x660F2E,

    XORPS = 0x000F57,
    XORPD = 0x660F57,

    // Use STO and LOD instead of MOV to distinguish the direction
    STOSS  = 0xF30F11,
    STOSD  = 0xF20F11,
    STOAPS = 0x000F29,
    STOAPD = 0x660F29,
    STODQA = 0x660F7F,
    STOD   = 0x660F7E,        // MOVD reg/mem64, xmm   66 0F 7E /r
    STOQ   = 0x660FD6,

    LODSS  = 0xF30F10,
    LODSD  = 0xF20F10,
    LODAPS = 0x000F28,
    LODAPD = 0x660F28,
    LODDQA = 0x660F6F,
    LODD   = 0x660F6E,        // MOVD xmm, reg/mem64   66 0F 6E /r
    LODQ   = 0xF30F7E,

    LODDQU   = 0xF30F6F,      // MOVDQU xmm1, xmm2/mem128  F3 0F 6F /r
    STODQU   = 0xF30F7F,      // MOVDQU xmm1/mem128, xmm2  F3 0F 7F /r
    MOVDQ2Q  = 0xF20FD6,      // MOVDQ2Q mmx, xmm          F2 0F D6 /r
    MOVHLPS  = 0x0F12,        // MOVHLPS xmm1, xmm2        0F 12 /r
    LODHPD   = 0x660F16,
    STOHPD   = 0x660F17,      // MOVHPD mem64, xmm         66 0F 17 /r
    LODHPS   = 0x0F16,
    STOHPS   = 0x0F17,
    MOVLHPS  = 0x0F16,
    LODLPD   = 0x660F12,
    STOLPD   = 0x660F13,
    LODLPS   = 0x0F12,
    STOLPS   = 0x0F13,
    MOVMSKPD = 0x660F50,
    MOVMSKPS = 0x0F50,
    MOVNTDQ  = 0x660FE7,
    MOVNTI   = 0x0FC3,
    MOVNTPD  = 0x660F2B,
    MOVNTPS  = 0x0F2B,
    MOVNTQ   = 0x0FE7,
    MOVQ2DQ  = 0xF30FD6,
    LODUPD   = 0x660F10,
    STOUPD   = 0x660F11,
    LODUPS   = 0x0F10,
    STOUPS   = 0x0F11,

    PACKSSDW = 0x660F6B,
    PACKSSWB = 0x660F63,
    PACKUSWB = 0x660F67,
    PADDSB = 0x660FEC,
    PADDSW = 0x660FED,
    PADDUSB = 0x660FDC,
    PADDUSW = 0x660FDD,
    PANDN = 0x660FDF,
    PCMPEQB = 0x660F74,
    PCMPEQD = 0x660F76,
    PCMPEQW = 0x660F75,
    PCMPGTB = 0x660F64,
    PCMPGTD = 0x660F66,
    PCMPGTW = 0x660F65,
    PMADDWD = 0x660FF5,
    PSLLW = 0x660FF1,
    PSLLD = 0x660FF2,
    PSLLQ = 0x660FF3,
    PSRAW = 0x660FE1,
    PSRAD = 0x660FE2,
    PSRLW = 0x660FD1,
    PSRLD = 0x660FD2,
    PSRLQ = 0x660FD3,
    PSUBSB = 0x660FE8,
    PSUBSW = 0x660FE9,
    PSUBUSB = 0x660FD8,
    PSUBUSW = 0x660FD9,
    PUNPCKHBW = 0x660F68,
    PUNPCKHDQ = 0x660F6A,
    PUNPCKHWD = 0x660F69,
    PUNPCKLBW = 0x660F60,
    PUNPCKLDQ = 0x660F62,
    PUNPCKLWD = 0x660F61,
    PXOR = 0x660FEF,
    ANDPD = 0x660F54,
    ANDPS = 0x0F54,
    ANDNPD = 0x660F55,
    ANDNPS = 0x0F55,
    CMPPS = 0x0FC2,
    CMPPD = 0x660FC2,
    CMPSD = 0xF20FC2,
    CMPSS = 0xF30FC2,
    COMISD = 0x660F2F,
    COMISS = 0x0F2F,
    CVTDQ2PD = 0xF30FE6,
    CVTDQ2PS = 0x0F5B,
    CVTPD2DQ = 0xF20FE6,
    CVTPD2PI = 0x660F2D,
    CVTPD2PS = 0x660F5A,
    CVTPI2PD = 0x660F2A,
    CVTPI2PS = 0x0F2A,
    CVTPS2DQ = 0x660F5B,
    CVTPS2PD = 0x0F5A,
    CVTPS2PI = 0x0F2D,
    CVTSD2SI = 0xF20F2D,
    CVTSD2SS = 0xF20F5A,
    CVTSI2SD = 0xF20F2A,
    CVTSI2SS = 0xF30F2A,
    CVTSS2SD = 0xF30F5A,
    CVTSS2SI = 0xF30F2D,
    CVTTPD2PI = 0x660F2C,
    CVTTPD2DQ = 0x660FE6,
    CVTTPS2DQ = 0xF30F5B,
    CVTTPS2PI = 0x0F2C,
    CVTTSD2SI = 0xF20F2C,
    CVTTSS2SI = 0xF30F2C,
    MASKMOVDQU = 0x660FF7,
    MASKMOVQ = 0x0FF7,
    MAXPD = 0x660F5F,
    MAXPS = 0x0F5F,
    MAXSD = 0xF20F5F,
    MAXSS = 0xF30F5F,
    MINPD = 0x660F5D,
    MINPS = 0x0F5D,
    MINSD = 0xF20F5D,
    MINSS = 0xF30F5D,
    ORPD = 0x660F56,
    ORPS = 0x0F56,
    PAVGB = 0x660FE0,
    PAVGW = 0x660FE3,
    PMAXSW = 0x660FEE,
    //PINSRW = 0x660FC4,
    PMAXUB = 0x660FDE,
    PMINSW = 0x660FEA,
    PMINUB = 0x660FDA,
    //PMOVMSKB = 0x660FD7,
    PMULHUW = 0x660FE4,
    PMULHW = 0x660FE5,
    PMULUDQ = 0x660FF4,
    PSADBW = 0x660FF6,
    PUNPCKHQDQ = 0x660F6D,
    PUNPCKLQDQ = 0x660F6C,
    RCPPS = 0x0F53,
    RCPSS = 0xF30F53,
    RSQRTPS = 0x0F52,
    RSQRTSS = 0xF30F52,
    SQRTPD = 0x660F51,
    SHUFPD = 0x660FC6,
    SHUFPS = 0x0FC6,
    SQRTPS = 0x0F51,
    SQRTSD = 0xF20F51,
    SQRTSS = 0xF30F51,
    UNPCKHPD = 0x660F15,
    UNPCKHPS = 0x0F15,
    UNPCKLPD = 0x660F14,
    UNPCKLPS = 0x0F14,

    PSHUFD = 0x660F70,
    PSHUFHW = 0xF30F70,
    PSHUFLW = 0xF20F70,
    PSHUFW = 0x0F70,
    // Both share opcode 66 0F 73; the extra high byte carries the
    // ModRM reg-field opcode extension (/7 vs /3) to tell them apart.
    PSLLDQ = 0x07660F73,
    PSRLDQ = 0x03660F73,

    //PREFETCH = 0x0F18,

// SSE3 Pentium 4 (Prescott)

    ADDSUBPD = 0x660FD0,
    ADDSUBPS = 0xF20FD0,
    HADDPD   = 0x660F7C,
    HADDPS   = 0xF20F7C,
    HSUBPD   = 0x660F7D,
    HSUBPS   = 0xF20F7D,
    MOVDDUP  = 0xF20F12,
    MOVSHDUP = 0xF30F16,
    MOVSLDUP = 0xF30F12,
    LDDQU    = 0xF20FF0,
    MONITOR  = 0x0F01C8,
    MWAIT    = 0x0F01C9,

// SSSE3
    PALIGNR = 0x660F3A0F,
    PHADDD = 0x660F3802,
    PHADDW = 0x660F3801,
    PHADDSW = 0x660F3803,
    PABSB = 0x660F381C,
    PABSD = 0x660F381E,
    PABSW = 0x660F381D,
    PSIGNB = 0x660F3808,
    PSIGND = 0x660F380A,
    PSIGNW = 0x660F3809,
    PSHUFB = 0x660F3800,
    PMADDUBSW = 0x660F3804,
    PMULHRSW = 0x660F380B,
    PHSUBD = 0x660F3806,
    PHSUBW = 0x660F3805,
    PHSUBSW = 0x660F3807,

// SSE4.1

    BLENDPD   = 0x660F3A0D,
    BLENDPS   = 0x660F3A0C,
    BLENDVPD  = 0x660F3815,
    BLENDVPS  = 0x660F3814,
    DPPD      = 0x660F3A41,
    DPPS      = 0x660F3A40,
    EXTRACTPS = 0x660F3A17,
    INSERTPS  = 0x660F3A21,
    MPSADBW   = 0x660F3A42,
    PBLENDVB  = 0x660F3810,
    PBLENDW   = 0x660F3A0E,
    PEXTRD    = 0x660F3A16,
    PEXTRQ    = 0x660F3A16,
    PINSRB    = 0x660F3A20,
    PINSRD    = 0x660F3A22,
    PINSRQ    = 0x660F3A22,

    MOVNTDQA = 0x660F382A,
    PACKUSDW = 0x660F382B,
    PCMPEQQ = 0x660F3829,
    PEXTRB = 0x660F3A14,
    PHMINPOSUW = 0x660F3841,
    PMAXSB = 0x660F383C,
    PMAXSD = 0x660F383D,
    PMAXUD = 0x660F383F,
    PMAXUW = 0x660F383E,
    PMINSB = 0x660F3838,
    PMINSD = 0x660F3839,
    PMINUD = 0x660F383B,
    PMINUW = 0x660F383A,
    PMOVSXBW = 0x660F3820,
    PMOVSXBD = 0x660F3821,
    PMOVSXBQ = 0x660F3822,
    PMOVSXWD = 0x660F3823,
    PMOVSXWQ = 0x660F3824,
    PMOVSXDQ = 0x660F3825,
    PMOVZXBW = 0x660F3830,
    PMOVZXBD = 0x660F3831,
    PMOVZXBQ = 0x660F3832,
    PMOVZXWD = 0x660F3833,
    PMOVZXWQ = 0x660F3834,
    PMOVZXDQ = 0x660F3835,
    PMULDQ   = 0x660F3828,
    PMULLD   = 0x660F3840,
    PTEST    = 0x660F3817,

    ROUNDPD = 0x660F3A09,
    ROUNDPS = 0x660F3A08,
    ROUNDSD = 0x660F3A0B,
    ROUNDSS = 0x660F3A0A,

// SSE4.2
    PCMPESTRI  = 0x660F3A61,
    PCMPESTRM  = 0x660F3A60,
    PCMPISTRI  = 0x660F3A63,
    PCMPISTRM  = 0x660F3A62,
    PCMPGTQ    = 0x660F3837,
    //CRC32

// SSE4a (AMD only)
    // EXTRQ,INSERTQ,MOVNTSD,MOVNTSS

// POPCNT and LZCNT (have their own CPUID bits)
    POPCNT     = 0xF30FB8,
    // LZCNT
  }
385 
  /**
   * Generate two operand instruction with XMM 128 bit operands.
   *
   * This is a compiler magic function - it doesn't behave like
   * regular D functions.
   *
   * Parameters:
   *      opcode  any of the XMM opcodes; it must be a compile time constant
   *      op1     first operand
   *      op2     second operand
   * Returns:
   *      result of opcode
   */
  pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2);

  /**
   * Unary SIMD instructions.
   *
   * Parameters:
   *      opcode  any of the XMM opcodes; it must be a compile time constant
   *      op1     the operand (a vector, double or float depending on overload)
   * Returns:
   *      result of opcode
   */
  pure @safe void16 __simd(XMM opcode, void16 op1);
  pure @safe void16 __simd(XMM opcode, double d);       ///
  pure @safe void16 __simd(XMM opcode, float f);        ///

  /****
   * For instructions:
   * CMPPD, CMPSS, CMPSD, CMPPS,
   * PSHUFD, PSHUFHW, PSHUFLW,
   * BLENDPD, BLENDPS, DPPD, DPPS,
   * MPSADBW, PBLENDW,
   * ROUNDPD, ROUNDPS, ROUNDSD, ROUNDSS
   * Parameters:
   *      opcode  any of the above XMM opcodes; it must be a compile time constant
   *      op1     first operand
   *      op2     second operand
   *      imm8    third operand; must be a compile time constant
   * Returns:
   *      result of opcode
   */
  pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2, ubyte imm8);

  /***
   * For instructions with the imm8 version:
   * PSLLD, PSLLQ, PSLLW, PSRAD, PSRAW, PSRLD, PSRLQ, PSRLW,
   * PSRLDQ, PSLLDQ
   * Parameters:
   *      opcode  any of the XMM opcodes; it must be a compile time constant
   *      op1     first operand
   *      imm8    second operand; must be a compile time constant
   * Returns:
   *      result of opcode
   */
  pure @safe void16 __simd_ib(XMM opcode, void16 op1, ubyte imm8);

  /*****
   * For "store" operations of the form:
   *    op1 op= op2
   * Returns:
   *    op2
   * These cannot be marked as pure, as semantic() doesn't check them.
   */
  @safe void16 __simd_sto(XMM opcode, void16 op1, void16 op2);
  @safe void16 __simd_sto(XMM opcode, double op1, void16 op2); ///
  @safe void16 __simd_sto(XMM opcode, float op1, void16 op2);  ///
448 
  /* The following use overloading to ensure correct typing.
   * Compile with inlining on for best performance.
   */

  /// Lane-wise 16-bit equality compare of v1 and v2 (PCMPEQW); per the
  /// Intel ISA, each result lane is all ones on equality, all zeros otherwise.
  pure @safe short8 pcmpeq()(short8 v1, short8 v2)
  {
      return __simd(XMM.PCMPEQW, v1, v2);
  }

  /// ditto
  pure @safe ushort8 pcmpeq()(ushort8 v1, ushort8 v2)
  {
      return __simd(XMM.PCMPEQW, v1, v2);
  }
462 
  /*********************
   * Emit prefetch instruction.
   * Params:
   *    address = address to be prefetched
   *    writeFetch = true for write fetch, false for read fetch
   *    locality = 0..3 (0 meaning least local, 3 meaning most local)
   * Note:
   *    The Intel mappings are:
   *    $(TABLE
   *    $(THEAD writeFetch, locality, Instruction)
   *    $(TROW false, 0, prefetchnta)
   *    $(TROW false, 1, prefetcht2)
   *    $(TROW false, 2, prefetcht1)
   *    $(TROW false, 3, prefetcht0)
   *    $(TROW true, 0, prefetchw)
   *    $(TROW true, 1, prefetchw)
   *    $(TROW true, 2, prefetchw)
   *    $(TROW true, 3, prefetchw)
   *    )
   */
  void prefetch(bool writeFetch, ubyte locality)(const(void)* address)
  {
        static if (writeFetch)
            __prefetch(address, 4);        // write fetch ignores locality (always prefetchw)
        else static if (locality < 4)
            __prefetch(address, 3 - locality);
        else
            static assert(0, "0..3 expected for locality");
  }

  // Compiler intrinsic backing prefetch(); `encoding` selects the
  // instruction variant (0..3 map the read-fetch hints, 4 the write fetch).
  private void __prefetch(const(void*) address, ubyte encoding);
494 
495   /*************************************
496    * Load unaligned vector from address.
497    * This is a compiler intrinsic.
498    * Params:
499    *    p = pointer to vector
500    * Returns:
501    *    vector
502    */
503 
504   V loadUnaligned(V)(const V* p)
505         if (is(V == void16) ||
506             is(V == byte16) ||
507             is(V == ubyte16) ||
508             is(V == short8) ||
509             is(V == ushort8) ||
510             is(V == int4) ||
511             is(V == uint4) ||
512             is(V == long2) ||
513             is(V == ulong2))
514   {
515         pragma(inline, true);
516         static if (is(V == double2))
517             return cast(V)__simd(XMM.LODUPD, *cast(const void16*)p);
518         else static if (is(V == float4))
519             return cast(V)__simd(XMM.LODUPS, *cast(const void16*)p);
520         else
521             return cast(V)__simd(XMM.LODDQU, *cast(const void16*)p);
522   }
523 
524   /*************************************
525    * Store vector to unaligned address.
526    * This is a compiler intrinsic.
527    * Params:
528    *    p = pointer to vector
529    *    value = value to store
530    * Returns:
531    *    value
532    */
533 
534   V storeUnaligned(V)(V* p, V value)
535         if (is(V == void16) ||
536             is(V == byte16) ||
537             is(V == ubyte16) ||
538             is(V == short8) ||
539             is(V == ushort8) ||
540             is(V == int4) ||
541             is(V == uint4) ||
542             is(V == long2) ||
543             is(V == ulong2))
544   {
545         pragma(inline, true);
546         static if (is(V == double2))
547             return cast(V)__simd_sto(XMM.STOUPD, *cast(void16*)p, value);
548         else static if (is(V == float4))
549             return cast(V)__simd_sto(XMM.STOUPS, *cast(void16*)p, value);
550         else
551             return cast(V)__simd_sto(XMM.STODQU, *cast(void16*)p, value);
552   }
553 }
554