1// Copyright 2016 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5//go:build ignore 6// +build ignore 7 8package main 9 10import ( 11 "bytes" 12 "io/ioutil" 13 "log" 14 "strings" 15 "text/template" 16) 17 18const ( 19 copyright = "" + 20 "// Copyright 2016 The Go Authors. All rights reserved.\n" + 21 "// Use of this source code is governed by a BSD-style\n" + 22 "// license that can be found in the LICENSE file.\n" 23 24 doNotEdit = "// generated by go run gen.go; DO NOT EDIT\n" 25 26 dashDashDash = "// --------" 27) 28 29func main() { 30 tmpl, err := ioutil.ReadFile("gen_acc_amd64.s.tmpl") 31 if err != nil { 32 log.Fatalf("ReadFile: %v", err) 33 } 34 if !bytes.HasPrefix(tmpl, []byte(copyright)) { 35 log.Fatal("source template did not start with the copyright header") 36 } 37 tmpl = tmpl[len(copyright):] 38 39 preamble := []byte(nil) 40 if i := bytes.Index(tmpl, []byte(dashDashDash)); i < 0 { 41 log.Fatalf("source template did not contain %q", dashDashDash) 42 } else { 43 preamble, tmpl = tmpl[:i], tmpl[i:] 44 } 45 46 t, err := template.New("").Parse(string(tmpl)) 47 if err != nil { 48 log.Fatalf("Parse: %v", err) 49 } 50 51 out := bytes.NewBuffer(nil) 52 out.WriteString(doNotEdit) 53 out.Write(preamble) 54 55 for i, v := range instances { 56 if i != 0 { 57 out.WriteString("\n") 58 } 59 if strings.Contains(v.LoadArgs, "{{.ShortName}}") { 60 v.LoadArgs = strings.Replace(v.LoadArgs, "{{.ShortName}}", v.ShortName, -1) 61 } 62 if err := t.Execute(out, v); err != nil { 63 log.Fatalf("Execute(%q): %v", v.ShortName, err) 64 } 65 } 66 67 if err := ioutil.WriteFile("acc_amd64.s", out.Bytes(), 0666); err != nil { 68 log.Fatalf("WriteFile: %v", err) 69 } 70} 71 72var instances = []struct { 73 LongName string 74 ShortName string 75 FrameSize string 76 ArgsSize string 77 Args string 78 DstElemSize1 int 79 DstElemSize4 int 80 XMM3 string 81 XMM4 string 82 XMM5 string 83 XMM6 string 
84 XMM8 string 85 XMM9 string 86 XMM10 string 87 LoadArgs string 88 Setup string 89 LoadXMMRegs string 90 Add string 91 ClampAndScale string 92 ConvertToInt32 string 93 Store4 string 94 Store1 string 95}{{ 96 LongName: "fixedAccumulateOpOver", 97 ShortName: "fxAccOpOver", 98 FrameSize: fxFrameSize, 99 ArgsSize: twoArgArgsSize, 100 Args: "dst []uint8, src []uint32", 101 DstElemSize1: 1 * sizeOfUint8, 102 DstElemSize4: 4 * sizeOfUint8, 103 XMM3: fxXMM3, 104 XMM4: fxXMM4, 105 XMM5: fxXMM5, 106 XMM6: opOverXMM6, 107 XMM8: opOverXMM8, 108 XMM9: opOverXMM9, 109 XMM10: opOverXMM10, 110 LoadArgs: twoArgLoadArgs, 111 Setup: fxSetup, 112 LoadXMMRegs: fxLoadXMMRegs + "\n" + opOverLoadXMMRegs, 113 Add: fxAdd, 114 ClampAndScale: fxClampAndScale, 115 ConvertToInt32: fxConvertToInt32, 116 Store4: opOverStore4, 117 Store1: opOverStore1, 118}, { 119 LongName: "fixedAccumulateOpSrc", 120 ShortName: "fxAccOpSrc", 121 FrameSize: fxFrameSize, 122 ArgsSize: twoArgArgsSize, 123 Args: "dst []uint8, src []uint32", 124 DstElemSize1: 1 * sizeOfUint8, 125 DstElemSize4: 4 * sizeOfUint8, 126 XMM3: fxXMM3, 127 XMM4: fxXMM4, 128 XMM5: fxXMM5, 129 XMM6: opSrcXMM6, 130 XMM8: opSrcXMM8, 131 XMM9: opSrcXMM9, 132 XMM10: opSrcXMM10, 133 LoadArgs: twoArgLoadArgs, 134 Setup: fxSetup, 135 LoadXMMRegs: fxLoadXMMRegs + "\n" + opSrcLoadXMMRegs, 136 Add: fxAdd, 137 ClampAndScale: fxClampAndScale, 138 ConvertToInt32: fxConvertToInt32, 139 Store4: opSrcStore4, 140 Store1: opSrcStore1, 141}, { 142 LongName: "fixedAccumulateMask", 143 ShortName: "fxAccMask", 144 FrameSize: fxFrameSize, 145 ArgsSize: oneArgArgsSize, 146 Args: "buf []uint32", 147 DstElemSize1: 1 * sizeOfUint32, 148 DstElemSize4: 4 * sizeOfUint32, 149 XMM3: fxXMM3, 150 XMM4: fxXMM4, 151 XMM5: fxXMM5, 152 XMM6: maskXMM6, 153 XMM8: maskXMM8, 154 XMM9: maskXMM9, 155 XMM10: maskXMM10, 156 LoadArgs: oneArgLoadArgs, 157 Setup: fxSetup, 158 LoadXMMRegs: fxLoadXMMRegs + "\n" + maskLoadXMMRegs, 159 Add: fxAdd, 160 ClampAndScale: fxClampAndScale, 161 
ConvertToInt32: fxConvertToInt32, 162 Store4: maskStore4, 163 Store1: maskStore1, 164}, { 165 LongName: "floatingAccumulateOpOver", 166 ShortName: "flAccOpOver", 167 FrameSize: flFrameSize, 168 ArgsSize: twoArgArgsSize, 169 Args: "dst []uint8, src []float32", 170 DstElemSize1: 1 * sizeOfUint8, 171 DstElemSize4: 4 * sizeOfUint8, 172 XMM3: flXMM3, 173 XMM4: flXMM4, 174 XMM5: flXMM5, 175 XMM6: opOverXMM6, 176 XMM8: opOverXMM8, 177 XMM9: opOverXMM9, 178 XMM10: opOverXMM10, 179 LoadArgs: twoArgLoadArgs, 180 Setup: flSetup, 181 LoadXMMRegs: flLoadXMMRegs + "\n" + opOverLoadXMMRegs, 182 Add: flAdd, 183 ClampAndScale: flClampAndScale, 184 ConvertToInt32: flConvertToInt32, 185 Store4: opOverStore4, 186 Store1: opOverStore1, 187}, { 188 LongName: "floatingAccumulateOpSrc", 189 ShortName: "flAccOpSrc", 190 FrameSize: flFrameSize, 191 ArgsSize: twoArgArgsSize, 192 Args: "dst []uint8, src []float32", 193 DstElemSize1: 1 * sizeOfUint8, 194 DstElemSize4: 4 * sizeOfUint8, 195 XMM3: flXMM3, 196 XMM4: flXMM4, 197 XMM5: flXMM5, 198 XMM6: opSrcXMM6, 199 XMM8: opSrcXMM8, 200 XMM9: opSrcXMM9, 201 XMM10: opSrcXMM10, 202 LoadArgs: twoArgLoadArgs, 203 Setup: flSetup, 204 LoadXMMRegs: flLoadXMMRegs + "\n" + opSrcLoadXMMRegs, 205 Add: flAdd, 206 ClampAndScale: flClampAndScale, 207 ConvertToInt32: flConvertToInt32, 208 Store4: opSrcStore4, 209 Store1: opSrcStore1, 210}, { 211 LongName: "floatingAccumulateMask", 212 ShortName: "flAccMask", 213 FrameSize: flFrameSize, 214 ArgsSize: twoArgArgsSize, 215 Args: "dst []uint32, src []float32", 216 DstElemSize1: 1 * sizeOfUint32, 217 DstElemSize4: 4 * sizeOfUint32, 218 XMM3: flXMM3, 219 XMM4: flXMM4, 220 XMM5: flXMM5, 221 XMM6: maskXMM6, 222 XMM8: maskXMM8, 223 XMM9: maskXMM9, 224 XMM10: maskXMM10, 225 LoadArgs: twoArgLoadArgs, 226 Setup: flSetup, 227 LoadXMMRegs: flLoadXMMRegs + "\n" + maskLoadXMMRegs, 228 Add: flAdd, 229 ClampAndScale: flClampAndScale, 230 ConvertToInt32: flConvertToInt32, 231 Store4: maskStore4, 232 Store1: maskStore1, 233}} 234 
// The constants below are the per-instance building blocks referenced by the
// instances table. The fx prefix is used by the fixed-point instances, the fl
// prefix by the floating-point ones; opOver, opSrc and mask select the three
// output flavors. Multi-line values are assembly fragments that end up in the
// generated file verbatim, including their comments.
const (
	// Stack frame sizes in bytes. The floating-point variants need 8 bytes
	// for the two 4-byte MXCSR slots used by flSetup and flConvertToInt32.
	fxFrameSize = `0`
	flFrameSize = `8`

	// Argument sizes in bytes: each slice argument occupies 24 bytes (see
	// the offsets used in twoArgLoadArgs).
	oneArgArgsSize = `24`
	twoArgArgsSize = `48`

	// Element sizes in bytes, used to compute DstElemSize1 and DstElemSize4.
	sizeOfUint8  = 1
	sizeOfUint32 = 4

	// Names of the constants held in X3, X4 and X5 (see fxLoadXMMRegs and
	// flLoadXMMRegs). `-` appears to mean the register is not used by that
	// variant; confirm against the template.
	fxXMM3 = `-`
	flXMM3 = `flSignMask`

	fxXMM4 = `-`
	flXMM4 = `flOne`

	fxXMM5 = `fxAlmost65536`
	flXMM5 = `flAlmost65536`

	// oneArgLoadArgs loads the single buf slice into both the (DI, BX) and
	// the (SI, R10) register pairs that twoArgLoadArgs fills from dst and
	// src respectively.
	oneArgLoadArgs = `
	MOVQ buf_base+0(FP), DI
	MOVQ buf_len+8(FP), BX
	MOVQ buf_base+0(FP), SI
	MOVQ buf_len+8(FP), R10
`
	// twoArgLoadArgs loads dst into (DI, BX) and src into (SI, R10). The
	// {{.ShortName}} placeholder in the jump label is substituted by main
	// before the template is executed.
	twoArgLoadArgs = `
	MOVQ dst_base+0(FP), DI
	MOVQ dst_len+8(FP), BX
	MOVQ src_base+24(FP), SI
	MOVQ src_len+32(FP), R10
	// Sanity check that len(dst) >= len(src).
	CMPQ BX, R10
	JLT {{.ShortName}}End
`

	// Per-function setup. Only the floating-point variants need any: they
	// prepare a modified copy of MXCSR for use by flConvertToInt32.
	fxSetup = ``
	flSetup = `
	// Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is
	// "Round To Zero".
	STMXCSR mxcsrOrig-8(SP)
	MOVL mxcsrOrig-8(SP), AX
	ORL $0x6000, AX
	MOVL AX, mxcsrNew-4(SP)
`

	// Loads of the constants shared by every flavor of a given arithmetic.
	fxLoadXMMRegs = `
	// fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16.
	MOVOU fxAlmost65536<>(SB), X5
`
	flLoadXMMRegs = `
	// flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
	// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
	// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
	MOVOU flSignMask<>(SB), X3
	MOVOU flOne<>(SB), X4
	MOVOU flAlmost65536<>(SB), X5
`

	// The packed add instruction used for accumulation.
	fxAdd = `PADDD`
	flAdd = `ADDPS`

	// Clamp-and-scale step: derives y (in X2) from x (in X1).
	fxClampAndScale = `
	// y = abs(x)
	// y >>= 2 // Shift by 2*ϕ - 16.
	// y = min(y, fxAlmost65536)
	PABSD X1, X2
	PSRLL $2, X2
	PMINUD X5, X2
`
	flClampAndScale = `
	// y = x & flSignMask
	// y = min(y, flOne)
	// y = mul(y, flAlmost65536)
	MOVOU X3, X2
	ANDPS X1, X2
	MINPS X4, X2
	MULPS X5, X2
`

	// Conversion of y to int32 lanes. The floating-point variant switches
	// to the MXCSR value prepared by flSetup around the conversion and then
	// restores the original.
	fxConvertToInt32 = `
	// z = convertToInt32(y)
	// No-op.
`
	flConvertToInt32 = `
	// z = convertToInt32(y)
	LDMXCSR mxcsrNew-4(SP)
	CVTPS2PL X2, X2
	LDMXCSR mxcsrOrig-8(SP)
`

	// Store4 fragments write four results at a time from X2 to (DI).
	opOverStore4 = `
	// Blend over the dst's prior value. SIMD for i in 0..3:
	//
	// dstA := uint32(dst[i]) * 0x101
	// maskA := z@i
	// outA := dstA*(0xffff-maskA)/0xffff + maskA
	// dst[i] = uint8(outA >> 8)
	//
	// First, set X0 to dstA*(0xffff-maskA).
	MOVL (DI), X0
	PSHUFB X8, X0
	MOVOU X9, X11
	PSUBL X2, X11
	PMULLD X11, X0
	// We implement uint32 division by 0xffff as multiplication by a magic
	// constant (0x80008001) and then a shift by a magic constant (47).
	// See TestDivideByFFFF for a justification.
	//
	// That multiplication widens from uint32 to uint64, so we have to
	// duplicate and shift our four uint32s from one XMM register (X0) to
	// two XMM registers (X0 and X11).
	//
	// Move the second and fourth uint32s in X0 to be the first and third
	// uint32s in X11.
	MOVOU X0, X11
	PSRLQ $32, X11
	// Multiply by magic, shift by magic.
	PMULULQ X10, X0
	PMULULQ X10, X11
	PSRLQ $47, X0
	PSRLQ $47, X11
	// Merge the two registers back to one, X11, and add maskA.
	PSLLQ $32, X11
	XORPS X0, X11
	PADDD X11, X2
	// As per opSrcStore4, shuffle and copy the 4 second-lowest bytes.
	PSHUFB X6, X2
	MOVL X2, (DI)
`
	opSrcStore4 = `
	// z = shuffleTheSecondLowestBytesOfEach4ByteElement(z)
	// copy(dst[:4], low4BytesOf(z))
	PSHUFB X6, X2
	MOVL X2, (DI)
`
	maskStore4 = `
	// copy(dst[:4], z)
	MOVOU X2, (DI)
`

	// Store1 fragments write a single result from X2 to (DI).
	opOverStore1 = `
	// Blend over the dst's prior value.
	//
	// dstA := uint32(dst[0]) * 0x101
	// maskA := z
	// outA := dstA*(0xffff-maskA)/0xffff + maskA
	// dst[0] = uint8(outA >> 8)
	MOVBLZX (DI), R12
	IMULL $0x101, R12
	MOVL X2, R13
	MOVL $0xffff, AX
	SUBL R13, AX
	MULL R12 // MULL's implicit arg is AX, and the result is stored in DX:AX.
	MOVL $0x80008001, BX // Divide by 0xffff is to first multiply by a magic constant...
	MULL BX // MULL's implicit arg is AX, and the result is stored in DX:AX.
	SHRL $15, DX // ...and then shift by another magic constant (47 - 32 = 15).
	ADDL DX, R13
	SHRL $8, R13
	MOVB R13, (DI)
`
	opSrcStore1 = `
	// dst[0] = uint8(z>>8)
	MOVL X2, BX
	SHRL $8, BX
	MOVB BX, (DI)
`
	maskStore1 = `
	// dst[0] = uint32(z)
	MOVL X2, (DI)
`

	// Names of the constants held in X6, X8, X9 and X10 for each flavor
	// (see opOverLoadXMMRegs and opSrcLoadXMMRegs). `-` appears to mean the
	// register is not used by that flavor; confirm against the template.
	opOverXMM6 = `gather`
	opSrcXMM6  = `gather`
	maskXMM6   = `-`

	opOverXMM8 = `scatterAndMulBy0x101`
	opSrcXMM8  = `-`
	maskXMM8   = `-`

	opOverXMM9 = `fxAlmost65536`
	opSrcXMM9  = `-`
	maskXMM9   = `-`

	opOverXMM10 = `inverseFFFF`
	opSrcXMM10  = `-`
	maskXMM10   = `-`

	// Flavor-specific constant loads, appended to fxLoadXMMRegs or
	// flLoadXMMRegs by the instances table. The mask flavor needs none.
	opOverLoadXMMRegs = `
	// gather := XMM(see above) // PSHUFB shuffle mask.
	// scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask.
	// fxAlmost65536 := XMM(0x0000ffff repeated four times) // 0xffff.
	// inverseFFFF := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff.
	MOVOU gather<>(SB), X6
	MOVOU scatterAndMulBy0x101<>(SB), X8
	MOVOU fxAlmost65536<>(SB), X9
	MOVOU inverseFFFF<>(SB), X10
`
	opSrcLoadXMMRegs = `
	// gather := XMM(see above) // PSHUFB shuffle mask.
	MOVOU gather<>(SB), X6
`
	maskLoadXMMRegs = ``
)
