1// Copyright 2016 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build ignore
6// +build ignore
7
8package main
9
10import (
11	"bytes"
12	"io/ioutil"
13	"log"
14	"strings"
15	"text/template"
16)
17
const (
	// copyright is the exact license header that the source template must
	// begin with. main strips it before processing the template.
	copyright = `// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
`

	// doNotEdit is the first line written to the generated file.
	doNotEdit = "// generated by go run gen.go; DO NOT EDIT" + "\n"

	// dashDashDash is the marker at which main splits the source template:
	// bytes before it are copied verbatim, bytes from it onwards are parsed
	// as a text/template.
	dashDashDash = `// --------`
)
28
29func main() {
30	tmpl, err := ioutil.ReadFile("gen_acc_amd64.s.tmpl")
31	if err != nil {
32		log.Fatalf("ReadFile: %v", err)
33	}
34	if !bytes.HasPrefix(tmpl, []byte(copyright)) {
35		log.Fatal("source template did not start with the copyright header")
36	}
37	tmpl = tmpl[len(copyright):]
38
39	preamble := []byte(nil)
40	if i := bytes.Index(tmpl, []byte(dashDashDash)); i < 0 {
41		log.Fatalf("source template did not contain %q", dashDashDash)
42	} else {
43		preamble, tmpl = tmpl[:i], tmpl[i:]
44	}
45
46	t, err := template.New("").Parse(string(tmpl))
47	if err != nil {
48		log.Fatalf("Parse: %v", err)
49	}
50
51	out := bytes.NewBuffer(nil)
52	out.WriteString(doNotEdit)
53	out.Write(preamble)
54
55	for i, v := range instances {
56		if i != 0 {
57			out.WriteString("\n")
58		}
59		if strings.Contains(v.LoadArgs, "{{.ShortName}}") {
60			v.LoadArgs = strings.Replace(v.LoadArgs, "{{.ShortName}}", v.ShortName, -1)
61		}
62		if err := t.Execute(out, v); err != nil {
63			log.Fatalf("Execute(%q): %v", v.ShortName, err)
64		}
65	}
66
67	if err := ioutil.WriteFile("acc_amd64.s", out.Bytes(), 0666); err != nil {
68		log.Fatalf("WriteFile: %v", err)
69	}
70}
71
// instances is the table of assembly functions to generate. main executes
// the template once per entry, in order, passing the entry as the template
// data, so the field names below are what the template refers to.
//
// There are six entries: the fixed-point (fx) and floating-point (fl)
// accumulators, each in an OpOver, OpSrc and Mask flavor. Most values are
// named constants declared further down in this file.
var instances = []struct {
	// LongName and ShortName identify the instance. main substitutes
	// ShortName for "{{.ShortName}}" inside LoadArgs and reports it when
	// template execution fails.
	// NOTE(review): LongName presumably names the Go-side function that the
	// generated assembly implements; confirm against the template.
	LongName       string
	ShortName      string
	// FrameSize and ArgsSize are decimal strings (see fxFrameSize and
	// oneArgArgsSize). Args is the Go parameter list of the function.
	// NOTE(review): presumably these feed the generated TEXT directive and
	// its signature comment; confirm against the template.
	FrameSize      string
	ArgsSize       string
	Args           string
	// DstElemSize1 and DstElemSize4 are the sizes in bytes of one and of
	// four destination elements.
	DstElemSize1   int
	DstElemSize4   int
	// XMM3 through XMM10 give the name of the constant that LoadXMMRegs
	// places in the corresponding X register, or `-` when this instance
	// does not load that register.
	XMM3           string
	XMM4           string
	XMM5           string
	XMM6           string
	XMM8           string
	XMM9           string
	XMM10          string
	// The remaining fields are assembly fragments (or, for Add, a single
	// instruction mnemonic) that differ between instances.
	LoadArgs       string
	Setup          string
	LoadXMMRegs    string
	Add            string
	ClampAndScale  string
	ConvertToInt32 string
	Store4         string
	Store1         string
}{{
	LongName:       "fixedAccumulateOpOver",
	ShortName:      "fxAccOpOver",
	FrameSize:      fxFrameSize,
	ArgsSize:       twoArgArgsSize,
	Args:           "dst []uint8, src []uint32",
	DstElemSize1:   1 * sizeOfUint8,
	DstElemSize4:   4 * sizeOfUint8,
	XMM3:           fxXMM3,
	XMM4:           fxXMM4,
	XMM5:           fxXMM5,
	XMM6:           opOverXMM6,
	XMM8:           opOverXMM8,
	XMM9:           opOverXMM9,
	XMM10:          opOverXMM10,
	LoadArgs:       twoArgLoadArgs,
	Setup:          fxSetup,
	LoadXMMRegs:    fxLoadXMMRegs + "\n" + opOverLoadXMMRegs,
	Add:            fxAdd,
	ClampAndScale:  fxClampAndScale,
	ConvertToInt32: fxConvertToInt32,
	Store4:         opOverStore4,
	Store1:         opOverStore1,
}, {
	LongName:       "fixedAccumulateOpSrc",
	ShortName:      "fxAccOpSrc",
	FrameSize:      fxFrameSize,
	ArgsSize:       twoArgArgsSize,
	Args:           "dst []uint8, src []uint32",
	DstElemSize1:   1 * sizeOfUint8,
	DstElemSize4:   4 * sizeOfUint8,
	XMM3:           fxXMM3,
	XMM4:           fxXMM4,
	XMM5:           fxXMM5,
	XMM6:           opSrcXMM6,
	XMM8:           opSrcXMM8,
	XMM9:           opSrcXMM9,
	XMM10:          opSrcXMM10,
	LoadArgs:       twoArgLoadArgs,
	Setup:          fxSetup,
	LoadXMMRegs:    fxLoadXMMRegs + "\n" + opSrcLoadXMMRegs,
	Add:            fxAdd,
	ClampAndScale:  fxClampAndScale,
	ConvertToInt32: fxConvertToInt32,
	Store4:         opSrcStore4,
	Store1:         opSrcStore1,
}, {
	LongName:       "fixedAccumulateMask",
	ShortName:      "fxAccMask",
	FrameSize:      fxFrameSize,
	ArgsSize:       oneArgArgsSize,
	Args:           "buf []uint32",
	DstElemSize1:   1 * sizeOfUint32,
	DstElemSize4:   4 * sizeOfUint32,
	XMM3:           fxXMM3,
	XMM4:           fxXMM4,
	XMM5:           fxXMM5,
	XMM6:           maskXMM6,
	XMM8:           maskXMM8,
	XMM9:           maskXMM9,
	XMM10:          maskXMM10,
	LoadArgs:       oneArgLoadArgs,
	Setup:          fxSetup,
	LoadXMMRegs:    fxLoadXMMRegs + "\n" + maskLoadXMMRegs,
	Add:            fxAdd,
	ClampAndScale:  fxClampAndScale,
	ConvertToInt32: fxConvertToInt32,
	Store4:         maskStore4,
	Store1:         maskStore1,
}, {
	LongName:       "floatingAccumulateOpOver",
	ShortName:      "flAccOpOver",
	FrameSize:      flFrameSize,
	ArgsSize:       twoArgArgsSize,
	Args:           "dst []uint8, src []float32",
	DstElemSize1:   1 * sizeOfUint8,
	DstElemSize4:   4 * sizeOfUint8,
	XMM3:           flXMM3,
	XMM4:           flXMM4,
	XMM5:           flXMM5,
	XMM6:           opOverXMM6,
	XMM8:           opOverXMM8,
	XMM9:           opOverXMM9,
	XMM10:          opOverXMM10,
	LoadArgs:       twoArgLoadArgs,
	Setup:          flSetup,
	LoadXMMRegs:    flLoadXMMRegs + "\n" + opOverLoadXMMRegs,
	Add:            flAdd,
	ClampAndScale:  flClampAndScale,
	ConvertToInt32: flConvertToInt32,
	Store4:         opOverStore4,
	Store1:         opOverStore1,
}, {
	LongName:       "floatingAccumulateOpSrc",
	ShortName:      "flAccOpSrc",
	FrameSize:      flFrameSize,
	ArgsSize:       twoArgArgsSize,
	Args:           "dst []uint8, src []float32",
	DstElemSize1:   1 * sizeOfUint8,
	DstElemSize4:   4 * sizeOfUint8,
	XMM3:           flXMM3,
	XMM4:           flXMM4,
	XMM5:           flXMM5,
	XMM6:           opSrcXMM6,
	XMM8:           opSrcXMM8,
	XMM9:           opSrcXMM9,
	XMM10:          opSrcXMM10,
	LoadArgs:       twoArgLoadArgs,
	Setup:          flSetup,
	LoadXMMRegs:    flLoadXMMRegs + "\n" + opSrcLoadXMMRegs,
	Add:            flAdd,
	ClampAndScale:  flClampAndScale,
	ConvertToInt32: flConvertToInt32,
	Store4:         opSrcStore4,
	Store1:         opSrcStore1,
}, {
	LongName:       "floatingAccumulateMask",
	ShortName:      "flAccMask",
	FrameSize:      flFrameSize,
	ArgsSize:       twoArgArgsSize,
	Args:           "dst []uint32, src []float32",
	DstElemSize1:   1 * sizeOfUint32,
	DstElemSize4:   4 * sizeOfUint32,
	XMM3:           flXMM3,
	XMM4:           flXMM4,
	XMM5:           flXMM5,
	XMM6:           maskXMM6,
	XMM8:           maskXMM8,
	XMM9:           maskXMM9,
	XMM10:          maskXMM10,
	LoadArgs:       twoArgLoadArgs,
	Setup:          flSetup,
	LoadXMMRegs:    flLoadXMMRegs + "\n" + maskLoadXMMRegs,
	Add:            flAdd,
	ClampAndScale:  flClampAndScale,
	ConvertToInt32: flConvertToInt32,
	Store4:         maskStore4,
	Store1:         maskStore1,
}}
234
// The constants below are the values referenced by the instances table.
// The name prefix says which instances use a value: fx and fl select the
// fixed-point or floating-point accumulator, while opOver, opSrc and mask
// select how the accumulated value is stored.
//
// The multi-line values are assembly fragments. Their text, including the
// // comments inside the back-quoted strings, is emitted as-is into the
// generated file.
const (
	// Stack frame sizes in bytes, as decimal strings. The floating-point
	// variant needs 8 bytes because flSetup and flConvertToInt32 keep the
	// original and the modified MXCSR values at mxcsrOrig-8(SP) and
	// mxcsrNew-4(SP).
	fxFrameSize = `0`
	flFrameSize = `8`

	// Argument sizes in bytes, as decimal strings, for functions taking one
	// or two slices (twoArgLoadArgs reads the second slice at offset 24).
	oneArgArgsSize = `24`
	twoArgArgsSize = `48`

	// Element sizes in bytes, used to compute DstElemSize1 and DstElemSize4.
	sizeOfUint8  = 1
	sizeOfUint32 = 4

	// Names of the constants held in X3, X4 and X5, or `-` when the variant
	// does not load that register (see fxLoadXMMRegs and flLoadXMMRegs).
	fxXMM3 = `-`
	flXMM3 = `flSignMask`

	fxXMM4 = `-`
	flXMM4 = `flOne`

	fxXMM5 = `fxAlmost65536`
	flXMM5 = `flAlmost65536`

	// oneArgLoadArgs loads the single buf slice as both the destination
	// (DI, BX) and the source (SI, R10).
	oneArgLoadArgs = `
		MOVQ buf_base+0(FP), DI
		MOVQ buf_len+8(FP), BX
		MOVQ buf_base+0(FP), SI
		MOVQ buf_len+8(FP), R10
		`
	// twoArgLoadArgs loads dst into DI, BX and src into SI, R10, and jumps
	// to the instance's End label when len(dst) < len(src). The
	// "{{.ShortName}}" placeholder is expanded by main before the template
	// is executed.
	twoArgLoadArgs = `
		MOVQ dst_base+0(FP), DI
		MOVQ dst_len+8(FP), BX
		MOVQ src_base+24(FP), SI
		MOVQ src_len+32(FP), R10
		// Sanity check that len(dst) >= len(src).
		CMPQ BX, R10
		JLT  {{.ShortName}}End
		`

	// Setup fragments. The fixed-point variant needs none; the
	// floating-point variant saves MXCSR and prepares a copy with bits 13
	// and 14 set, for use by flConvertToInt32.
	fxSetup = ``
	flSetup = `
		// Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is
		// "Round To Zero".
		STMXCSR mxcsrOrig-8(SP)
		MOVL    mxcsrOrig-8(SP), AX
		ORL     $0x6000, AX
		MOVL    AX, mxcsrNew-4(SP)
		`

	// Fragments that load the accumulator-specific constants into X3..X5.
	// They are concatenated with an op-specific fragment in instances.
	fxLoadXMMRegs = `
		// fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16.
		MOVOU fxAlmost65536<>(SB), X5
		`
	flLoadXMMRegs = `
		// flSignMask    := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
		// flOne         := XMM(0x3f800000 repeated four times) // 1 as a float32.
		// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
		MOVOU flSignMask<>(SB), X3
		MOVOU flOne<>(SB), X4
		MOVOU flAlmost65536<>(SB), X5
		`

	// Instruction mnemonics used for the accumulating addition.
	fxAdd = `PADDD`
	flAdd = `ADDPS`

	// Fragments that turn the accumulated value in X1 into a clamped,
	// scaled value in X2.
	fxClampAndScale = `
		// y = abs(x)
		// y >>= 2 // Shift by 2*ϕ - 16.
		// y = min(y, fxAlmost65536)
		PABSD  X1, X2
		PSRLL  $2, X2
		PMINUD X5, X2
		`
	flClampAndScale = `
		// y = x & flSignMask
		// y = min(y, flOne)
		// y = mul(y, flAlmost65536)
		MOVOU X3, X2
		ANDPS X1, X2
		MINPS X4, X2
		MULPS X5, X2
		`

	// Fragments that convert X2 to int32 lanes. The fixed-point variant
	// emits no instructions; the floating-point variant converts under the
	// MXCSR value prepared by flSetup and then restores the original.
	fxConvertToInt32 = `
		// z = convertToInt32(y)
		// No-op.
		`
	flConvertToInt32 = `
		// z = convertToInt32(y)
		LDMXCSR  mxcsrNew-4(SP)
		CVTPS2PL X2, X2
		LDMXCSR  mxcsrOrig-8(SP)
		`

	// Fragments that store four results at once from X2 to (DI).
	opOverStore4 = `
		// Blend over the dst's prior value. SIMD for i in 0..3:
		//
		// dstA := uint32(dst[i]) * 0x101
		// maskA := z@i
		// outA := dstA*(0xffff-maskA)/0xffff + maskA
		// dst[i] = uint8(outA >> 8)
		//
		// First, set X0 to dstA*(0xffff-maskA).
		MOVL   (DI), X0
		PSHUFB X8, X0
		MOVOU  X9, X11
		PSUBL  X2, X11
		PMULLD X11, X0
		// We implement uint32 division by 0xffff as multiplication by a magic
		// constant (0x80008001) and then a shift by a magic constant (47).
		// See TestDivideByFFFF for a justification.
		//
		// That multiplication widens from uint32 to uint64, so we have to
		// duplicate and shift our four uint32s from one XMM register (X0) to
		// two XMM registers (X0 and X11).
		//
		// Move the second and fourth uint32s in X0 to be the first and third
		// uint32s in X11.
		MOVOU X0, X11
		PSRLQ $32, X11
		// Multiply by magic, shift by magic.
		PMULULQ X10, X0
		PMULULQ X10, X11
		PSRLQ   $47, X0
		PSRLQ   $47, X11
		// Merge the two registers back to one, X11, and add maskA.
		PSLLQ $32, X11
		XORPS X0, X11
		PADDD X11, X2
		// As per opSrcStore4, shuffle and copy the 4 second-lowest bytes.
		PSHUFB X6, X2
		MOVL   X2, (DI)
		`
	opSrcStore4 = `
		// z = shuffleTheSecondLowestBytesOfEach4ByteElement(z)
		// copy(dst[:4], low4BytesOf(z))
		PSHUFB X6, X2
		MOVL   X2, (DI)
		`
	maskStore4 = `
		// copy(dst[:4], z)
		MOVOU X2, (DI)
		`

	// Fragments that store a single result from X2 to (DI).
	opOverStore1 = `
		// Blend over the dst's prior value.
		//
		// dstA := uint32(dst[0]) * 0x101
		// maskA := z
		// outA := dstA*(0xffff-maskA)/0xffff + maskA
		// dst[0] = uint8(outA >> 8)
		MOVBLZX (DI), R12
		IMULL   $0x101, R12
		MOVL    X2, R13
		MOVL    $0xffff, AX
		SUBL    R13, AX
		MULL    R12             // MULL's implicit arg is AX, and the result is stored in DX:AX.
		MOVL    $0x80008001, BX // Divide by 0xffff is to first multiply by a magic constant...
		MULL    BX              // MULL's implicit arg is AX, and the result is stored in DX:AX.
		SHRL    $15, DX         // ...and then shift by another magic constant (47 - 32 = 15).
		ADDL    DX, R13
		SHRL    $8, R13
		MOVB    R13, (DI)
		`
	opSrcStore1 = `
		// dst[0] = uint8(z>>8)
		MOVL X2, BX
		SHRL $8, BX
		MOVB BX, (DI)
		`
	maskStore1 = `
		// dst[0] = uint32(z)
		MOVL X2, (DI)
		`

	// Names of the constants held in X6, X8, X9 and X10, or `-` when the
	// op does not load that register (see the *LoadXMMRegs fragments below).
	opOverXMM6 = `gather`
	opSrcXMM6  = `gather`
	maskXMM6   = `-`

	opOverXMM8 = `scatterAndMulBy0x101`
	opSrcXMM8  = `-`
	maskXMM8   = `-`

	opOverXMM9 = `fxAlmost65536`
	opSrcXMM9  = `-`
	maskXMM9   = `-`

	opOverXMM10 = `inverseFFFF`
	opSrcXMM10  = `-`
	maskXMM10   = `-`

	// Fragments that load the op-specific constants. The mask op needs
	// none.
	opOverLoadXMMRegs = `
		// gather               := XMM(see above)                      // PSHUFB shuffle mask.
		// scatterAndMulBy0x101 := XMM(see above)                      // PSHUFB shuffle mask.
		// fxAlmost65536        := XMM(0x0000ffff repeated four times) // 0xffff.
		// inverseFFFF          := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff.
		MOVOU gather<>(SB), X6
		MOVOU scatterAndMulBy0x101<>(SB), X8
		MOVOU fxAlmost65536<>(SB), X9
		MOVOU inverseFFFF<>(SB), X10
		`
	opSrcLoadXMMRegs = `
		// gather := XMM(see above) // PSHUFB shuffle mask.
		MOVOU gather<>(SB), X6
		`
	maskLoadXMMRegs = ``
)
438