1//+build !noasm,!appengine,gc
2
3// Copyright (c) 2020 MinIO Inc. All rights reserved.
4// Use of this source code is governed by a license that can be
5// found in the LICENSE file.
6
7package md5simd
8
9import (
10	"fmt"
11	"math"
12	"sync"
13	"unsafe"
14
15	"github.com/klauspost/cpuid"
16)
17
// hasAVX512 records whether the 16-lane AVX512 code path may be used.
// It holds the zero value (false) until init stores the result of the
// CPUID AVX512F query.
var hasAVX512 bool
19
// block8 has no Go body; it is declared here and implemented outside this
// file (assembly). blockMd5_avx2 calls it once per round with:
//
//   - state: address of the first word of a digest8 (v0[0]),
//   - base:  the base address the lane offsets are relative to,
//   - bufs:  address of the first of eight int32 lane offsets,
//   - cache: address of the first byte of a cache8 scratch buffer,
//   - n:     number of bytes to hash per lane, always 64*rounds at the call site.
//
// NOTE(review): the parameter names presumably have to match the frame
// references used by the assembly; do not rename them without checking.
//
//go:noescape
func block8(state *uint32, base uintptr, bufs *int32, cache *byte, n int)
22
// block16 has no Go body; it is declared here and implemented outside this
// file (assembly). blockMd5_avx512 calls it once per round with:
//
//   - state: address of the first word of a digest16 (v0[0]),
//   - base:  the base address the lane offsets are relative to,
//   - ptrs:  address of the first of sixteen int32 lane offsets,
//   - mask:  the lane mask of the current round (one bit per lane),
//   - n:     number of bytes to hash per lane, always 64*rounds at the call site.
//
// NOTE(review): the parameter names presumably have to match the frame
// references used by the assembly; do not rename them without checking.
//
//go:noescape
func block16(state *uint32, base uintptr, ptrs *int32, mask uint64, n int)
25
// digest8 holds the MD5 state of eight lanes: 8-way 4x uint32 digests, laid
// out for 4 ymm registers (ymm0, ymm1, ymm2, ymm3). State word k of lane i
// lives in vk[i], so each field is one row of eight 32-bit words.
type digest8 struct {
	v0, v1, v2, v3 [8]uint32
}
31
// cache8 is the stack scratch buffer handed to block8: 8 lanes x 64 bytes
// (one md5.BlockSize block per lane) = 512 bytes. The buffer must be 32-byte
// aligned, so 32 spare bytes are allocated on top and the start address is
// aligned upwards at runtime.
type cache8 [512 + 32]byte
36
// md5consts holds the 64 MD5 magic numbers for one lane of hashing. The SIMD
// tables further down are built from it by repeating every constant once
// per lane (see broadcastConsts).
var md5consts = [64]uint32{
	0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
	0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
	0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
	0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
	0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
	0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
	0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
	0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
	0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
	0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
	0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05,
	0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
	0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
	0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
	0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
	0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391,
}

// broadcastConsts returns a freshly allocated slice in which every element
// of c is repeated lanes times in a row, i.e. out[i*lanes+j] == c[i] for
// 0 <= j < lanes. The result has length lanes*len(c).
func broadcastConsts(c []uint32, lanes int) []uint32 {
	out := make([]uint32, 0, lanes*len(c))
	for _, k := range c {
		for j := 0; j < lanes; j++ {
			out = append(out, k)
		}
	}
	return out
}

// avx256md5consts is md5consts inflated 8-way for 8x md5 (one constant
// replicated across a 256-bit ymm register).
//
// NOTE(review): nothing in this file reads the table, so it is presumably
// consumed by the assembly; keep it a package-level []uint32 unless the
// assembly is updated too.
var avx256md5consts = broadcastConsts(md5consts[:], 8)

// digest16 holds the MD5 state of sixteen lanes: 16-way 4x uint32 digests,
// laid out for 4 zmm registers. State word k of lane i lives in vk[i].
type digest16 struct {
	v0, v1, v2, v3 [16]uint32
}

// avx512md5consts is md5consts inflated 16-way for 16x md5 (one constant
// replicated across a 512-bit zmm register).
//
// NOTE(review): nothing in this file reads the table, so it is presumably
// consumed by the assembly; keep it a package-level []uint32 unless the
// assembly is updated too.
var avx512md5consts = broadcastConsts(md5consts[:], 16)
84
85func init() {
86	hasAVX512 = cpuid.CPU.AVX512F()
87}
88
89// Interface function to assembly code
90func (s *md5Server) blockMd5_x16(d *digest16, input [16][]byte, half bool) {
91	if hasAVX512 {
92		blockMd5_avx512(d, input, s.allBufs, &s.maskRounds16)
93	} else {
94		d8a, d8b := digest8{}, digest8{}
95		for i := range d8a.v0 {
96			j := i + 8
97			d8a.v0[i], d8a.v1[i], d8a.v2[i], d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
98			if !half {
99				d8b.v0[i], d8b.v1[i], d8b.v2[i], d8b.v3[i] = d.v0[j], d.v1[j], d.v2[j], d.v3[j]
100			}
101		}
102
103		i8 := [2][8][]byte{}
104		for i := range i8[0] {
105			i8[0][i], i8[1][i] = input[i], input[8+i]
106		}
107		if half {
108			blockMd5_avx2(&d8a, i8[0], s.allBufs, &s.maskRounds8a)
109		} else {
110			wg := sync.WaitGroup{}
111			wg.Add(2)
112			go func() { blockMd5_avx2(&d8a, i8[0], s.allBufs, &s.maskRounds8a); wg.Done() }()
113			go func() { blockMd5_avx2(&d8b, i8[1], s.allBufs, &s.maskRounds8b); wg.Done() }()
114			wg.Wait()
115		}
116
117		for i := range d8a.v0 {
118			j := i + 8
119			d.v0[i], d.v1[i], d.v2[i], d.v3[i] = d8a.v0[i], d8a.v1[i], d8a.v2[i], d8a.v3[i]
120			if !half {
121				d.v0[j], d.v1[j], d.v2[j], d.v3[j] = d8b.v0[i], d8b.v1[i], d8b.v2[i], d8b.v3[i]
122			}
123		}
124	}
125}
126
127// Interface function to AVX512 assembly code
128func blockMd5_avx512(s *digest16, input [16][]byte, base []byte, maskRounds *[16]maskRounds) {
129	baseMin := uint64(uintptr(unsafe.Pointer(&(base[0]))))
130	ptrs := [16]int32{}
131
132	for i := range ptrs {
133		if len(input[i]) > 0 {
134			if len(input[i]) > internalBlockSize {
135				panic(fmt.Sprintf("Sanity check fails for lane %d: maximum input length cannot exceed internalBlockSize", i))
136			}
137
138			off := uint64(uintptr(unsafe.Pointer(&(input[i][0])))) - baseMin
139			if off > math.MaxUint32 {
140				panic(fmt.Sprintf("invalid buffer sent with offset %x", off))
141			}
142			ptrs[i] = int32(off)
143		}
144	}
145
146	sdup := *s // create copy of initial states to receive intermediate updates
147
148	rounds := generateMaskAndRounds16(input, maskRounds)
149
150	for r := 0; r < rounds; r++ {
151		m := maskRounds[r]
152
153		block16(&sdup.v0[0], uintptr(baseMin), &ptrs[0], m.mask, int(64*m.rounds))
154
155		for j := 0; j < len(ptrs); j++ {
156			ptrs[j] += int32(64 * m.rounds) // update pointers for next round
157			if m.mask&(1<<j) != 0 {         // update digest if still masked as active
158				(*s).v0[j], (*s).v1[j], (*s).v2[j], (*s).v3[j] = sdup.v0[j], sdup.v1[j], sdup.v2[j], sdup.v3[j]
159			}
160		}
161	}
162}
163
164// Interface function to AVX2 assembly code
165func blockMd5_avx2(s *digest8, input [8][]byte, base []byte, maskRounds *[8]maskRounds) {
166	baseMin := uint64(uintptr(unsafe.Pointer(&(base[0])))) - 4
167	ptrs := [8]int32{}
168
169	for i := range ptrs {
170		if len(input[i]) > 0 {
171			if len(input[i]) > internalBlockSize {
172				panic(fmt.Sprintf("Sanity check fails for lane %d: maximum input length cannot exceed internalBlockSize", i))
173			}
174
175			off := uint64(uintptr(unsafe.Pointer(&(input[i][0])))) - baseMin
176			if off > math.MaxUint32 {
177				panic(fmt.Sprintf("invalid buffer sent with offset %x", off))
178			}
179			ptrs[i] = int32(off)
180		}
181	}
182
183	sdup := *s // create copy of initial states to receive intermediate updates
184
185	rounds := generateMaskAndRounds8(input, maskRounds)
186
187	for r := 0; r < rounds; r++ {
188		m := maskRounds[r]
189		var cache cache8 // stack storage for block8 tmp state
190		block8(&sdup.v0[0], uintptr(baseMin), &ptrs[0], &cache[0], int(64*m.rounds))
191
192		for j := 0; j < len(ptrs); j++ {
193			ptrs[j] += int32(64 * m.rounds) // update pointers for next round
194			if m.mask&(1<<j) != 0 {         // update digest if still masked as active
195				(*s).v0[j], (*s).v1[j], (*s).v2[j], (*s).v3[j] = sdup.v0[j], sdup.v1[j], sdup.v2[j], sdup.v3[j]
196			}
197		}
198	}
199}
200