1//+build !noasm,!appengine
2
3/*
4 * Minio Cloud Storage, (C) 2017 Minio, Inc.
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 *     http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19package sha256
20
21import (
22	"encoding/binary"
23	"errors"
24	"hash"
25	"sort"
26	"sync/atomic"
27	"time"
28)
29
// sha256X16Avx512 is declared without a Go body; blockAvx512 below is its only
// caller in this file and refers to it as the assembly code. It is handed the
// 512-byte digest state, a 512-byte scratch buffer, the constant table, the
// per-round lane masks and one input slice per lane.
// NOTE(review): the memory layout the routine expects for each argument is not
// visible from here - confirm against the assembly source.
//
//go:noescape
func sha256X16Avx512(digests *[512]byte, scratch *[512]byte, table *[512]uint64, mask []uint64, inputs [16][]byte)
32
// Avx512ServerUID - Do not start at 0 but next multiple of 16 so as to be able to
// differentiate with default initialization value of 0.
// NOTE(review): nothing in the code visible here references this constant; uids
// are handed out from uidCounter instead - confirm whether external code uses it.
const Avx512ServerUID = 16

// uidCounter is incremented atomically by NewAvx512 to give every digest its
// own uid.
var uidCounter uint64
38
39// NewAvx512 - initialize sha256 Avx512 implementation.
40func NewAvx512(a512srv *Avx512Server) hash.Hash {
41	uid := atomic.AddUint64(&uidCounter, 1)
42	return &Avx512Digest{uid: uid, a512srv: a512srv}
43}
44
// Avx512Digest - Type for computing SHA256 using Avx512.
// It keeps only the not-yet-submitted tail of the input; every complete chunk
// is sent to the Avx512Server over its input channel, tagged with uid.
type Avx512Digest struct {
	uid     uint64        // id attached to every message sent to the server
	a512srv *Avx512Server // server whose input channel receives the messages
	x       [chunk]byte   // bytes written so far that do not yet fill a chunk
	nx      int           // number of valid bytes in x
	len     uint64        // total number of bytes written
	final   bool          // set by Sum; Write fails while set, Reset clears it
	result  [Size]byte    // digest received from the server, returned by repeated Sum calls
}
55
56// Size - Return size of checksum
57func (d *Avx512Digest) Size() int { return Size }
58
59// BlockSize - Return blocksize of checksum
60func (d Avx512Digest) BlockSize() int { return BlockSize }
61
62// Reset - reset sha digest to its initial values
63func (d *Avx512Digest) Reset() {
64	d.a512srv.blocksCh <- blockInput{uid: d.uid, reset: true}
65	d.nx = 0
66	d.len = 0
67	d.final = false
68}
69
70// Write to digest
71func (d *Avx512Digest) Write(p []byte) (nn int, err error) {
72
73	if d.final {
74		return 0, errors.New("Avx512Digest already finalized. Reset first before writing again")
75	}
76
77	nn = len(p)
78	d.len += uint64(nn)
79	if d.nx > 0 {
80		n := copy(d.x[d.nx:], p)
81		d.nx += n
82		if d.nx == chunk {
83			d.a512srv.blocksCh <- blockInput{uid: d.uid, msg: d.x[:]}
84			d.nx = 0
85		}
86		p = p[n:]
87	}
88	if len(p) >= chunk {
89		n := len(p) &^ (chunk - 1)
90		d.a512srv.blocksCh <- blockInput{uid: d.uid, msg: p[:n]}
91		p = p[n:]
92	}
93	if len(p) > 0 {
94		d.nx = copy(d.x[:], p)
95	}
96	return
97}
98
99// Sum - Return sha256 sum in bytes
100func (d *Avx512Digest) Sum(in []byte) (result []byte) {
101
102	if d.final {
103		return append(in, d.result[:]...)
104	}
105
106	trail := make([]byte, 0, 128)
107	trail = append(trail, d.x[:d.nx]...)
108
109	len := d.len
110	// Padding.  Add a 1 bit and 0 bits until 56 bytes mod 64.
111	var tmp [64]byte
112	tmp[0] = 0x80
113	if len%64 < 56 {
114		trail = append(trail, tmp[0:56-len%64]...)
115	} else {
116		trail = append(trail, tmp[0:64+56-len%64]...)
117	}
118	d.nx = 0
119
120	// Length in bits.
121	len <<= 3
122	for i := uint(0); i < 8; i++ {
123		tmp[i] = byte(len >> (56 - 8*i))
124	}
125	trail = append(trail, tmp[0:8]...)
126
127	sumCh := make(chan [Size]byte)
128	d.a512srv.blocksCh <- blockInput{uid: d.uid, msg: trail, final: true, sumCh: sumCh}
129	d.result = <-sumCh
130	d.final = true
131	return append(in, d.result[:]...)
132}
133
// table holds the constants passed to the assembly routine. It consists of 64
// distinct 32-bit values; each value is duplicated into both halves of a
// 64-bit word and that word is stored 8 times in a row, giving 512 entries.
var table = func() [512]uint64 {
	words := [64]uint32{
		0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
		0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
		0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
		0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
		0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
		0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
		0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
		0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
	}

	var expanded [512]uint64
	for i, w := range words {
		doubled := uint64(w)<<32 | uint64(w)
		for j := 0; j < 8; j++ {
			expanded[i*8+j] = doubled
		}
	}
	return expanded
}()
263
264// Interface function to assembly ode
265func blockAvx512(digests *[512]byte, input [16][]byte, mask []uint64) [16][Size]byte {
266
267	scratch := [512]byte{}
268	sha256X16Avx512(digests, &scratch, &table, mask, input)
269
270	output := [16][Size]byte{}
271	for i := 0; i < 16; i++ {
272		output[i] = getDigest(i, digests[:])
273	}
274
275	return output
276}
277
278func getDigest(index int, state []byte) (sum [Size]byte) {
279	for j := 0; j < 16; j += 2 {
280		for i := index*4 + j*Size; i < index*4+(j+1)*Size; i += Size {
281			binary.BigEndian.PutUint32(sum[j*2:], binary.LittleEndian.Uint32(state[i:i+4]))
282		}
283	}
284	return
285}
286
// Message to send across input channel.
// A message either asks the server to reset a uid (reset set) or carries data
// for that uid (msg), optionally marked as the last message (final).
type blockInput struct {
	uid   uint64          // id of the hash computation the message belongs to
	msg   []byte          // data to be hashed; stored by the server until it is processed
	reset bool            // when set, the server only discards its state for uid
	final bool            // when set, the digest is sent back on sumCh after processing
	sumCh chan [Size]byte // reply channel, used by the server only for final messages
}
295
// Avx512Server - Type to implement 16x parallel handling of SHA256 invocations.
// All fields other than blocksCh are read and written only by the methods that
// Process calls (reset, blocks, getDigests) in the code visible here.
type Avx512Server struct {
	blocksCh chan blockInput       // Input channel
	totalIn  int                   // Total number of inputs waiting to be processed
	lanes    [16]Avx512LaneInfo    // Array with info per lane (out of 16)
	digests  map[uint64][Size]byte // Map of uids to (interim) digest results
}
303
// Avx512LaneInfo - Info for each lane.
// The zero value (nil block) marks a lane with nothing queued.
type Avx512LaneInfo struct {
	uid      uint64          // unique identification for this SHA processing
	block    []byte          // input block to be processed
	outputCh chan [Size]byte // channel for output result
}
310
311// NewAvx512Server - Create new object for parallel processing handling
312func NewAvx512Server() *Avx512Server {
313	a512srv := &Avx512Server{}
314	a512srv.digests = make(map[uint64][Size]byte)
315	a512srv.blocksCh = make(chan blockInput)
316
317	// Start a single thread for reading from the input channel
318	go a512srv.Process()
319	return a512srv
320}
321
322// Process - Sole handler for reading from the input channel
323func (a512srv *Avx512Server) Process() {
324	for {
325		select {
326		case block := <-a512srv.blocksCh:
327			if block.reset {
328				a512srv.reset(block.uid)
329				continue
330			}
331			index := block.uid & 0xf
332			// fmt.Println("Adding message:", block.uid, index)
333
334			if a512srv.lanes[index].block != nil { // If slot is already filled, process all inputs
335				//fmt.Println("Invoking Blocks()")
336				a512srv.blocks()
337			}
338			a512srv.totalIn++
339			a512srv.lanes[index] = Avx512LaneInfo{uid: block.uid, block: block.msg}
340			if block.final {
341				a512srv.lanes[index].outputCh = block.sumCh
342			}
343			if a512srv.totalIn == len(a512srv.lanes) {
344				// fmt.Println("Invoking Blocks() while FULL: ")
345				a512srv.blocks()
346			}
347
348			// TODO: test with larger timeout
349		case <-time.After(1 * time.Microsecond):
350			for _, lane := range a512srv.lanes {
351				if lane.block != nil { // check if there is any input to process
352					// fmt.Println("Invoking Blocks() on TIMEOUT: ")
353					a512srv.blocks()
354					break // we are done
355				}
356			}
357		}
358	}
359}
360
361// Do a reset for this calculation
362func (a512srv *Avx512Server) reset(uid uint64) {
363
364	// Check if there is a message still waiting to be processed (and remove if so)
365	for i, lane := range a512srv.lanes {
366		if lane.uid == uid {
367			if lane.block != nil {
368				a512srv.lanes[i] = Avx512LaneInfo{} // clear message
369				a512srv.totalIn--
370			}
371		}
372	}
373
374	// Delete entry from hash map
375	delete(a512srv.digests, uid)
376}
377
378// Invoke assembly and send results back
379func (a512srv *Avx512Server) blocks() {
380
381	inputs := [16][]byte{}
382	for i := range inputs {
383		inputs[i] = a512srv.lanes[i].block
384	}
385
386	mask := expandMask(genMask(inputs))
387	outputs := blockAvx512(a512srv.getDigests(), inputs, mask)
388
389	a512srv.totalIn = 0
390	for i := 0; i < len(outputs); i++ {
391		uid, outputCh := a512srv.lanes[i].uid, a512srv.lanes[i].outputCh
392		a512srv.digests[uid] = outputs[i]
393		a512srv.lanes[i] = Avx512LaneInfo{}
394
395		if outputCh != nil {
396			// Send back result
397			outputCh <- outputs[i]
398			delete(a512srv.digests, uid) // Delete entry from hashmap
399		}
400	}
401}
402
403func (a512srv *Avx512Server) Write(uid uint64, p []byte) (nn int, err error) {
404	a512srv.blocksCh <- blockInput{uid: uid, msg: p}
405	return len(p), nil
406}
407
408// Sum - return sha256 sum in bytes for a given sum id.
409func (a512srv *Avx512Server) Sum(uid uint64, p []byte) [32]byte {
410	sumCh := make(chan [32]byte)
411	a512srv.blocksCh <- blockInput{uid: uid, msg: p, final: true, sumCh: sumCh}
412	return <-sumCh
413}
414
415func (a512srv *Avx512Server) getDigests() *[512]byte {
416	digests := [512]byte{}
417	for i, lane := range a512srv.lanes {
418		a, ok := a512srv.digests[lane.uid]
419		if ok {
420			binary.BigEndian.PutUint32(digests[(i+0*16)*4:], binary.LittleEndian.Uint32(a[0:4]))
421			binary.BigEndian.PutUint32(digests[(i+1*16)*4:], binary.LittleEndian.Uint32(a[4:8]))
422			binary.BigEndian.PutUint32(digests[(i+2*16)*4:], binary.LittleEndian.Uint32(a[8:12]))
423			binary.BigEndian.PutUint32(digests[(i+3*16)*4:], binary.LittleEndian.Uint32(a[12:16]))
424			binary.BigEndian.PutUint32(digests[(i+4*16)*4:], binary.LittleEndian.Uint32(a[16:20]))
425			binary.BigEndian.PutUint32(digests[(i+5*16)*4:], binary.LittleEndian.Uint32(a[20:24]))
426			binary.BigEndian.PutUint32(digests[(i+6*16)*4:], binary.LittleEndian.Uint32(a[24:28]))
427			binary.BigEndian.PutUint32(digests[(i+7*16)*4:], binary.LittleEndian.Uint32(a[28:32]))
428		} else {
429			binary.LittleEndian.PutUint32(digests[(i+0*16)*4:], init0)
430			binary.LittleEndian.PutUint32(digests[(i+1*16)*4:], init1)
431			binary.LittleEndian.PutUint32(digests[(i+2*16)*4:], init2)
432			binary.LittleEndian.PutUint32(digests[(i+3*16)*4:], init3)
433			binary.LittleEndian.PutUint32(digests[(i+4*16)*4:], init4)
434			binary.LittleEndian.PutUint32(digests[(i+5*16)*4:], init5)
435			binary.LittleEndian.PutUint32(digests[(i+6*16)*4:], init6)
436			binary.LittleEndian.PutUint32(digests[(i+7*16)*4:], init7)
437		}
438	}
439	return &digests
440}
441
// lane pairs the length of an input block with the position of the lane it
// came from, so that blocks can be ordered by length without losing track of
// their lane.
type lane struct {
	len uint // length of the block in bytes
	pos uint // index of the lane holding the block
}

// lanes implements sort.Interface, ordering by ascending block length.
type lanes []lane

// Len reports the number of entries.
func (lns lanes) Len() int {
	return len(lns)
}

// Swap exchanges entries i and j.
func (lns lanes) Swap(i, j int) {
	tmp := lns[i]
	lns[i] = lns[j]
	lns[j] = tmp
}

// Less reports whether entry i holds a strictly shorter block than entry j.
func (lns lanes) Less(i, j int) bool {
	return lns[j].len > lns[i].len
}
453
// Helper struct for describing a run of rounds that share the same lane mask.
// genMask produces these entries and expandMask repeats mask once per round.
type maskRounds struct {
	mask   uint64 // one bit per lane; genMask clears a lane's bit once that lane has been accounted for
	rounds uint64 // number of consecutive rounds (64-byte steps, see genMask) that use mask
}
459
460func genMask(input [16][]byte) [16]maskRounds {
461
462	// Sort on blocks length small to large
463	var sorted [16]lane
464	for c, inpt := range input {
465		sorted[c] = lane{uint(len(inpt)), uint(c)}
466	}
467	sort.Sort(lanes(sorted[:]))
468
469	// Create mask array including 'rounds' between masks
470	m, round, index := uint64(0xffff), uint64(0), 0
471	var mr [16]maskRounds
472	for _, s := range sorted {
473		if s.len > 0 {
474			if uint64(s.len)>>6 > round {
475				mr[index] = maskRounds{m, (uint64(s.len) >> 6) - round}
476				index++
477			}
478			round = uint64(s.len) >> 6
479		}
480		m = m & ^(1 << uint(s.pos))
481	}
482
483	return mr
484}
485
486// TODO: remove function
487func expandMask(mr [16]maskRounds) []uint64 {
488	size := uint64(0)
489	for _, r := range mr {
490		size += r.rounds
491	}
492	result, index := make([]uint64, size), 0
493	for _, r := range mr {
494		for j := uint64(0); j < r.rounds; j++ {
495			result[index] = r.mask
496			index++
497		}
498	}
499	return result
500}
501