1// Copyright 2016 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build ignore
6
7package main
8
9import (
10	"bytes"
11	"io/ioutil"
12	"log"
13	"strings"
14	"text/template"
15)
16
// Fixed pieces of text that main looks for in the template file or writes to
// the generated file.
const (
	// copyright is the exact header that the template file must start with.
	// main strips it from the template before any further processing.
	copyright = `// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
`

	// doNotEdit is the first line written to the generated file.
	doNotEdit = "// generated by go run gen.go; DO NOT EDIT\n"

	// dashDashDash is the marker at which main splits the template file:
	// the text before it is copied verbatim, the text from it onwards is
	// parsed as a text/template.
	dashDashDash = `// --------`
)
27
28func main() {
29	tmpl, err := ioutil.ReadFile("gen_acc_amd64.s.tmpl")
30	if err != nil {
31		log.Fatalf("ReadFile: %v", err)
32	}
33	if !bytes.HasPrefix(tmpl, []byte(copyright)) {
34		log.Fatal("source template did not start with the copyright header")
35	}
36	tmpl = tmpl[len(copyright):]
37
38	preamble := []byte(nil)
39	if i := bytes.Index(tmpl, []byte(dashDashDash)); i < 0 {
40		log.Fatalf("source template did not contain %q", dashDashDash)
41	} else {
42		preamble, tmpl = tmpl[:i], tmpl[i:]
43	}
44
45	t, err := template.New("").Parse(string(tmpl))
46	if err != nil {
47		log.Fatalf("Parse: %v", err)
48	}
49
50	out := bytes.NewBuffer(nil)
51	out.WriteString(doNotEdit)
52	out.Write(preamble)
53
54	for i, v := range instances {
55		if i != 0 {
56			out.WriteString("\n")
57		}
58		if strings.Contains(v.LoadArgs, "{{.ShortName}}") {
59			v.LoadArgs = strings.Replace(v.LoadArgs, "{{.ShortName}}", v.ShortName, -1)
60		}
61		if err := t.Execute(out, v); err != nil {
62			log.Fatalf("Execute(%q): %v", v.ShortName, err)
63		}
64	}
65
66	if err := ioutil.WriteFile("acc_amd64.s", out.Bytes(), 0666); err != nil {
67		log.Fatalf("WriteFile: %v", err)
68	}
69}
70
71var instances = []struct {
72	LongName       string
73	ShortName      string
74	FrameSize      string
75	ArgsSize       string
76	Args           string
77	DstElemSize1   int
78	DstElemSize4   int
79	XMM3           string
80	XMM4           string
81	XMM5           string
82	XMM6           string
83	XMM8           string
84	XMM9           string
85	XMM10          string
86	LoadArgs       string
87	Setup          string
88	LoadXMMRegs    string
89	Add            string
90	ClampAndScale  string
91	ConvertToInt32 string
92	Store4         string
93	Store1         string
94}{{
95	LongName:       "fixedAccumulateOpOver",
96	ShortName:      "fxAccOpOver",
97	FrameSize:      fxFrameSize,
98	ArgsSize:       twoArgArgsSize,
99	Args:           "dst []uint8, src []uint32",
100	DstElemSize1:   1 * sizeOfUint8,
101	DstElemSize4:   4 * sizeOfUint8,
102	XMM3:           fxXMM3,
103	XMM4:           fxXMM4,
104	XMM5:           fxXMM5,
105	XMM6:           opOverXMM6,
106	XMM8:           opOverXMM8,
107	XMM9:           opOverXMM9,
108	XMM10:          opOverXMM10,
109	LoadArgs:       twoArgLoadArgs,
110	Setup:          fxSetup,
111	LoadXMMRegs:    fxLoadXMMRegs + "\n" + opOverLoadXMMRegs,
112	Add:            fxAdd,
113	ClampAndScale:  fxClampAndScale,
114	ConvertToInt32: fxConvertToInt32,
115	Store4:         opOverStore4,
116	Store1:         opOverStore1,
117}, {
118	LongName:       "fixedAccumulateOpSrc",
119	ShortName:      "fxAccOpSrc",
120	FrameSize:      fxFrameSize,
121	ArgsSize:       twoArgArgsSize,
122	Args:           "dst []uint8, src []uint32",
123	DstElemSize1:   1 * sizeOfUint8,
124	DstElemSize4:   4 * sizeOfUint8,
125	XMM3:           fxXMM3,
126	XMM4:           fxXMM4,
127	XMM5:           fxXMM5,
128	XMM6:           opSrcXMM6,
129	XMM8:           opSrcXMM8,
130	XMM9:           opSrcXMM9,
131	XMM10:          opSrcXMM10,
132	LoadArgs:       twoArgLoadArgs,
133	Setup:          fxSetup,
134	LoadXMMRegs:    fxLoadXMMRegs + "\n" + opSrcLoadXMMRegs,
135	Add:            fxAdd,
136	ClampAndScale:  fxClampAndScale,
137	ConvertToInt32: fxConvertToInt32,
138	Store4:         opSrcStore4,
139	Store1:         opSrcStore1,
140}, {
141	LongName:       "fixedAccumulateMask",
142	ShortName:      "fxAccMask",
143	FrameSize:      fxFrameSize,
144	ArgsSize:       oneArgArgsSize,
145	Args:           "buf []uint32",
146	DstElemSize1:   1 * sizeOfUint32,
147	DstElemSize4:   4 * sizeOfUint32,
148	XMM3:           fxXMM3,
149	XMM4:           fxXMM4,
150	XMM5:           fxXMM5,
151	XMM6:           maskXMM6,
152	XMM8:           maskXMM8,
153	XMM9:           maskXMM9,
154	XMM10:          maskXMM10,
155	LoadArgs:       oneArgLoadArgs,
156	Setup:          fxSetup,
157	LoadXMMRegs:    fxLoadXMMRegs + "\n" + maskLoadXMMRegs,
158	Add:            fxAdd,
159	ClampAndScale:  fxClampAndScale,
160	ConvertToInt32: fxConvertToInt32,
161	Store4:         maskStore4,
162	Store1:         maskStore1,
163}, {
164	LongName:       "floatingAccumulateOpOver",
165	ShortName:      "flAccOpOver",
166	FrameSize:      flFrameSize,
167	ArgsSize:       twoArgArgsSize,
168	Args:           "dst []uint8, src []float32",
169	DstElemSize1:   1 * sizeOfUint8,
170	DstElemSize4:   4 * sizeOfUint8,
171	XMM3:           flXMM3,
172	XMM4:           flXMM4,
173	XMM5:           flXMM5,
174	XMM6:           opOverXMM6,
175	XMM8:           opOverXMM8,
176	XMM9:           opOverXMM9,
177	XMM10:          opOverXMM10,
178	LoadArgs:       twoArgLoadArgs,
179	Setup:          flSetup,
180	LoadXMMRegs:    flLoadXMMRegs + "\n" + opOverLoadXMMRegs,
181	Add:            flAdd,
182	ClampAndScale:  flClampAndScale,
183	ConvertToInt32: flConvertToInt32,
184	Store4:         opOverStore4,
185	Store1:         opOverStore1,
186}, {
187	LongName:       "floatingAccumulateOpSrc",
188	ShortName:      "flAccOpSrc",
189	FrameSize:      flFrameSize,
190	ArgsSize:       twoArgArgsSize,
191	Args:           "dst []uint8, src []float32",
192	DstElemSize1:   1 * sizeOfUint8,
193	DstElemSize4:   4 * sizeOfUint8,
194	XMM3:           flXMM3,
195	XMM4:           flXMM4,
196	XMM5:           flXMM5,
197	XMM6:           opSrcXMM6,
198	XMM8:           opSrcXMM8,
199	XMM9:           opSrcXMM9,
200	XMM10:          opSrcXMM10,
201	LoadArgs:       twoArgLoadArgs,
202	Setup:          flSetup,
203	LoadXMMRegs:    flLoadXMMRegs + "\n" + opSrcLoadXMMRegs,
204	Add:            flAdd,
205	ClampAndScale:  flClampAndScale,
206	ConvertToInt32: flConvertToInt32,
207	Store4:         opSrcStore4,
208	Store1:         opSrcStore1,
209}, {
210	LongName:       "floatingAccumulateMask",
211	ShortName:      "flAccMask",
212	FrameSize:      flFrameSize,
213	ArgsSize:       twoArgArgsSize,
214	Args:           "dst []uint32, src []float32",
215	DstElemSize1:   1 * sizeOfUint32,
216	DstElemSize4:   4 * sizeOfUint32,
217	XMM3:           flXMM3,
218	XMM4:           flXMM4,
219	XMM5:           flXMM5,
220	XMM6:           maskXMM6,
221	XMM8:           maskXMM8,
222	XMM9:           maskXMM9,
223	XMM10:          maskXMM10,
224	LoadArgs:       twoArgLoadArgs,
225	Setup:          flSetup,
226	LoadXMMRegs:    flLoadXMMRegs + "\n" + maskLoadXMMRegs,
227	Add:            flAdd,
228	ClampAndScale:  flClampAndScale,
229	ConvertToInt32: flConvertToInt32,
230	Store4:         maskStore4,
231	Store1:         maskStore1,
232}}
233
// Building blocks for the elements of instances.
//
// Constants prefixed "fx" belong to the fixed point flavor and "fl" to the
// floating point flavor; "opOver", "opSrc" and "mask" select the operation.
// The multi-line values are amd64 assembly fragments (in Go assembler syntax)
// that are substituted into the template; the comments inside them end up in
// the generated file.
const (
	// Stack frame size in bytes. The floating point flavor needs 8 bytes
	// for the two MXCSR copies used by flSetup and flConvertToInt32.
	fxFrameSize = `0`
	flFrameSize = `8`

	// Size in bytes of the arguments: one or two slice headers.
	oneArgArgsSize = `24`
	twoArgArgsSize = `48`

	sizeOfUint8  = 1
	sizeOfUint32 = 4

	// Names of the constants that the flavor's LoadXMMRegs fragment loads
	// into X3, X4 and X5; `-` means the register is not loaded.
	fxXMM3 = `-`
	flXMM3 = `flSignMask`

	fxXMM4 = `-`
	flXMM4 = `flOne`

	fxXMM5 = `fxAlmost65536`
	flXMM5 = `flAlmost65536`

	// Argument loading. Both forms fill DI/BX with the destination base and
	// length and SI/R10 with the source base and length; with one argument
	// the same slice serves as both.
	oneArgLoadArgs = `
		MOVQ buf_base+0(FP), DI
		MOVQ buf_len+8(FP), BX
		MOVQ buf_base+0(FP), SI
		MOVQ buf_len+8(FP), R10
		`
	// The {{.ShortName}} action below is expanded by main, which replaces
	// it with the element's ShortName before executing the template.
	twoArgLoadArgs = `
		MOVQ dst_base+0(FP), DI
		MOVQ dst_len+8(FP), BX
		MOVQ src_base+24(FP), SI
		MOVQ src_len+32(FP), R10
		// Sanity check that len(dst) >= len(src).
		CMPQ BX, R10
		JLT  {{.ShortName}}End
		`

	fxSetup = ``
	flSetup = `
		// Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is
		// "Round To Zero".
		STMXCSR mxcsrOrig-8(SP)
		MOVL    mxcsrOrig-8(SP), AX
		ORL     $0x6000, AX
		MOVL    AX, mxcsrNew-4(SP)
		`

	fxLoadXMMRegs = `
		// fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16.
		MOVOU fxAlmost65536<>(SB), X5
		`
	flLoadXMMRegs = `
		// flSignMask    := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
		// flOne         := XMM(0x3f800000 repeated four times) // 1 as a float32.
		// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
		MOVOU flSignMask<>(SB), X3
		MOVOU flOne<>(SB), X4
		MOVOU flAlmost65536<>(SB), X5
		`

	// Mnemonic of the packed add instruction for each flavor.
	fxAdd = `PADDD`
	flAdd = `ADDPS`

	fxClampAndScale = `
		// y = abs(x)
		// y >>= 2 // Shift by 2*ϕ - 16.
		// y = min(y, fxAlmost65536)
		PABSD  X1, X2
		PSRLL  $2, X2
		PMINUD X5, X2
		`
	flClampAndScale = `
		// y = x & flSignMask
		// y = min(y, flOne)
		// y = mul(y, flAlmost65536)
		MOVOU X3, X2
		ANDPS X1, X2
		MINPS X4, X2
		MULPS X5, X2
		`

	fxConvertToInt32 = `
		// z = convertToInt32(y)
		// No-op.
		`
	flConvertToInt32 = `
		// z = convertToInt32(y)
		LDMXCSR  mxcsrNew-4(SP)
		CVTPS2PL X2, X2
		LDMXCSR  mxcsrOrig-8(SP)
		`

	// Fragments that store four results at a time from X2 to (DI).
	opOverStore4 = `
		// Blend over the dst's prior value. SIMD for i in 0..3:
		//
		// dstA := uint32(dst[i]) * 0x101
		// maskA := z@i
		// outA := dstA*(0xffff-maskA)/0xffff + maskA
		// dst[i] = uint8(outA >> 8)
		//
		// First, set X0 to dstA*(0xffff-maskA).
		MOVL   (DI), X0
		PSHUFB X8, X0
		MOVOU  X9, X11
		PSUBL  X2, X11
		PMULLD X11, X0
		// We implement uint32 division by 0xffff as multiplication by a magic
		// constant (0x80008001) and then a shift by a magic constant (47).
		// See TestDivideByFFFF for a justification.
		//
		// That multiplication widens from uint32 to uint64, so we have to
		// duplicate and shift our four uint32s from one XMM register (X0) to
		// two XMM registers (X0 and X11).
		//
		// Move the second and fourth uint32s in X0 to be the first and third
		// uint32s in X11.
		MOVOU X0, X11
		PSRLQ $32, X11
		// Multiply by magic, shift by magic.
		PMULULQ X10, X0
		PMULULQ X10, X11
		PSRLQ   $47, X0
		PSRLQ   $47, X11
		// Merge the two registers back to one, X11, and add maskA.
		PSLLQ $32, X11
		XORPS X0, X11
		PADDD X11, X2
		// As per opSrcStore4, shuffle and copy the 4 second-lowest bytes.
		PSHUFB X6, X2
		MOVL   X2, (DI)
		`
	opSrcStore4 = `
		// z = shuffleTheSecondLowestBytesOfEach4ByteElement(z)
		// copy(dst[:4], low4BytesOf(z))
		PSHUFB X6, X2
		MOVL   X2, (DI)
		`
	maskStore4 = `
		// copy(dst[:4], z)
		MOVOU X2, (DI)
		`

	// Fragments that store a single result from X2 to (DI).
	opOverStore1 = `
		// Blend over the dst's prior value.
		//
		// dstA := uint32(dst[0]) * 0x101
		// maskA := z
		// outA := dstA*(0xffff-maskA)/0xffff + maskA
		// dst[0] = uint8(outA >> 8)
		MOVBLZX (DI), R12
		IMULL   $0x101, R12
		MOVL    X2, R13
		MOVL    $0xffff, AX
		SUBL    R13, AX
		MULL    R12             // MULL's implicit arg is AX, and the result is stored in DX:AX.
		MOVL    $0x80008001, BX // Divide by 0xffff is to first multiply by a magic constant...
		MULL    BX              // MULL's implicit arg is AX, and the result is stored in DX:AX.
		SHRL    $15, DX         // ...and then shift by another magic constant (47 - 32 = 15).
		ADDL    DX, R13
		SHRL    $8, R13
		MOVB    R13, (DI)
		`
	opSrcStore1 = `
		// dst[0] = uint8(z>>8)
		MOVL X2, BX
		SHRL $8, BX
		MOVB BX, (DI)
		`
	maskStore1 = `
		// dst[0] = uint32(z)
		MOVL X2, (DI)
		`

	// Names of the constants that the operation's LoadXMMRegs fragment
	// loads into X6, X8, X9 and X10; `-` means the register is not loaded.
	opOverXMM6 = `gather`
	opSrcXMM6  = `gather`
	maskXMM6   = `-`

	opOverXMM8 = `scatterAndMulBy0x101`
	opSrcXMM8  = `-`
	maskXMM8   = `-`

	opOverXMM9 = `fxAlmost65536`
	opSrcXMM9  = `-`
	maskXMM9   = `-`

	opOverXMM10 = `inverseFFFF`
	opSrcXMM10  = `-`
	maskXMM10   = `-`

	opOverLoadXMMRegs = `
		// gather               := XMM(see above)                      // PSHUFB shuffle mask.
		// scatterAndMulBy0x101 := XMM(see above)                      // PSHUFB shuffle mask.
		// fxAlmost65536        := XMM(0x0000ffff repeated four times) // 0xffff.
		// inverseFFFF          := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff.
		MOVOU gather<>(SB), X6
		MOVOU scatterAndMulBy0x101<>(SB), X8
		MOVOU fxAlmost65536<>(SB), X9
		MOVOU inverseFFFF<>(SB), X10
		`
	opSrcLoadXMMRegs = `
		// gather := XMM(see above) // PSHUFB shuffle mask.
		MOVOU gather<>(SB), X6
		`
	maskLoadXMMRegs = ``
)
437