1// Copyright 2017 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// The vectorized implementation found below is a derived work
6// from code written by Anton Blanchard <anton@au.ibm.com> found
7// at https://github.com/antonblanchard/crc32-vpmsum.  The original
8// is dual licensed under GPL and Apache 2.  As the copyright holder
9// for the work, IBM has contributed this new work under
10// the golang license.
11
12// Changes include porting to Go assembler with modifications for
13// the Go ABI for ppc64le.
14
15#include "textflag.h"
16
17#define POWER8_OFFSET 132
18
19#define off16	R16
20#define off32	R17
21#define off48	R18
22#define off64	R19
23#define off80	R20
24#define off96	R21
25#define	off112	R22
26
27#define const1	V24
28#define const2	V25
29
30#define byteswap	V26
31#define mask_32bit	V27
32#define mask_64bit	V28
33#define zeroes		V29
34
35#define MAX_SIZE	32*1024
36#define REFLECT
37
// func ppc64SlicingUpdateBy8(crc uint32, table8 *slicing8Table, p []byte) uint32
//
// Scalar slicing-by-8 CRC32: consumes p eight bytes per iteration using
// eight 1KB (256 x uint32) lookup tables laid out contiguously at *table8,
// then finishes any remaining 1-7 bytes one at a time.  The CRC is kept
// bit-inverted (^crc) for the duration of the loop, matching the table
// construction, and re-inverted before return.
TEXT ·ppc64SlicingUpdateBy8(SB), NOSPLIT|NOFRAME, $0-44
	MOVWZ	crc+0(FP), R3   // incoming crc
	MOVD    table8+8(FP), R4   // *Table (8 contiguous 256-entry tables)
	MOVD    p+16(FP), R5
	MOVD    p_len+24(FP), R6 // p len

	CMP     $0,R6           // len == 0?
	BNE     start
	MOVW    R3,ret+40(FP)   // empty input: return crc unchanged
	RET

start:
	NOR     R3,R3,R7        // ^crc (work on the inverted CRC)
	MOVWZ	R7,R7		// truncate to 32 bits
	CMP	R6,$16
	MOVD	R6,CTR		// byte count for the short loop
	BLT	short		// fewer than 16 bytes: bytewise only
	SRAD    $3,R6,R8        // number of 8 byte chunks
	MOVD    R8,CTR

	// Main loop: fold 8 bytes per iteration.  p[0:4] is xor'ed into the
	// crc and split into 4 table lookups (tables 4-7); p[4:8] feeds
	// tables 0-3.  Loads and xors are interleaved for scheduling.
loop:
	MOVWZ	0(R5),R8	// p[0:4] (little-endian load)
	MOVWZ	4(R5),R9	// p[4:8]
	MOVD	R4,R10		// &tab[0]
	XOR	R7,R8,R7	// crc ^= p[0:4]
	RLDICL	$40,R9,$56,R17	// p[7]
	SLD	$2,R17,R17	// p[7]*4 (byte index -> uint32 index)
	RLDICL	$40,R7,$56,R8	// crc>>24
	ADD	R17,R10,R17	// &tab[0][p[7]]
	SLD	$2,R8,R8	// (crc>>24)*4
	RLDICL	$48,R9,$56,R18	// p[6]
	SLD	$2,R18,R18	// p[6]*4
	ADD	$1024,R10,R10	// tab[1]
	MOVWZ	0(R17),R21	// tab[0][p[7]]
	RLDICL	$56,R9,$56,R19	// p[5]
	ADD	R10,R18,R18	// &tab[1][p[6]]
	SLD	$2,R19,R19	// p[5]*4
	MOVWZ	0(R18),R22	// tab[1][p[6]]
	ADD	$1024,R10,R10	// tab[2]
	XOR	R21,R22,R21	// accumulate; R22 free
	ADD	R19,R10,R19	// &tab[2][p[5]]
	ANDCC	$255,R9,R20	// p[4]
	SLD	$2,R20,R20	// p[4]*4
	MOVWZ	0(R19),R23	// tab[2][p[5]]
	ADD	$1024,R10,R10	// &tab[3]
	ADD	R20,R10,R20	// &tab[3][p[4]]
	XOR	R21,R23,R21	// accumulate; R23 free
	ADD	$1024,R10,R10	// &tab[4]
	MOVWZ	0(R20),R24	// tab[3][p[4]]
	ADD	R10,R8,R23	// &tab[4][crc>>24]
	XOR	R21,R24,R21	// accumulate; R24 free
	MOVWZ	0(R23),R25	// tab[4][crc>>24]
	RLDICL	$48,R7,$56,R24	// (crc>>16)&0xFF
	XOR	R21,R25,R21	// accumulate; R25 free
	ADD	$1024,R10,R10	// &tab[5]
	SLD	$2,R24,R24	// ((crc>>16)&0xFF)*4
	ADD	R24,R10,R24	// &tab[5][(crc>>16)&0xFF]
	MOVWZ	0(R24),R26	// tab[5][(crc>>16)&0xFF]
	XOR	R21,R26,R21	// accumulate; R26 free
	RLDICL	$56,R7,$56,R25	// (crc>>8)&0xFF
	ADD	$1024,R10,R10	// &tab[6]
	SLD	$2,R25,R25	// ((crc>>8)&0xFF)*4
	ADD	R25,R10,R25	// &tab[6][(crc>>8)&0xFF]
	MOVBZ   R7,R26          // crc&0xFF
	ADD     $1024,R10,R10   // &tab[7]
	MOVWZ	0(R25),R27	// tab[6][(crc>>8)&0xFF]
	SLD	$2,R26,R26	// (crc&0xFF)*4
	XOR	R21,R27,R21	// accumulate; R27 free
	ADD	R26,R10,R26	// &tab[7][crc&0xFF]
	ADD     $8,R5           // p = p[8:]
	MOVWZ	0(R26),R28	// tab[7][crc&0xFF]
	XOR	R21,R28,R21	// accumulate; R28 free
	MOVWZ	R21,R7		// crc for next round
	BC	16,0,loop	// bdnz: next 8 bytes while CTR != 0
	ANDCC	$7,R6,R8	// any leftover bytes?
	BEQ	done		// none --> done
	MOVD	R8,CTR		// leftover byte count

	// Byte-at-a-time tail (also the whole loop for inputs < 16 bytes):
	// crc = tab[0][byte(crc)^p[i]] ^ (crc >> 8)
short:
	MOVBZ   0(R5),R8        // get next input byte v
	MOVBZ   R7,R9           // byte(crc)
	MOVWZ	R7,R14
	SRD	$8,R14,R14	// crc>>8
	XOR     R8,R9,R8        // byte(crc)^v
	ADD	$1,R5		// advance to next input byte
	SLD     $2,R8           // index -> byte offset
	ADD     R8,R4,R9        // &tab[0][byte(crc)^v]
	MOVWZ   0(R9),R10       // tab[0][byte(crc)^v]
	XOR     R10,R14,R7       // crc for next iteration
	MOVWZ   R7,R7           // truncate to 32 bits
	BC      16,0,short	// bdnz: next byte while CTR != 0
done:
	NOR     R7,R7,R7        // ^crc: undo the working inversion
	MOVW    R7,ret+40(FP)   // return crc
	RET
133
#ifdef BYTESWAP_DATA
// 16-byte permute constant holding the identity byte indices 0..15 in
// little-endian doubleword order; loaded into the byteswap vector register
// by vectorCrc32 when BYTESWAP_DATA is defined.  NOTE(review): its use as
// a byte-reversal permute mask is presumed from the name — the consuming
// VPERM is not visible in this file; confirm against the vector path.
DATA ·byteswapcons+0(SB)/8,$0x0706050403020100
DATA ·byteswapcons+8(SB)/8,$0x0f0e0d0c0b0a0908

GLOBL ·byteswapcons+0(SB),RODATA,$16
#endif
140
// func vectorCrc32(crc uint32, ctab uint32, p []byte) uint32
//
// Vectorized CRC32 using POWER8 VPMSUM carryless-multiply instructions,
// derived from Anton Blanchard's crc32-vpmsum.  The ctab id selects the
// constant table (1 = IEEE, otherwise Castagnoli).  Buffers of >= 256
// bytes are folded 128 bytes per iteration against per-offset constants,
// the partial products are combined, and the 128-bit remainder is reduced
// to 32 bits with a Barrett reduction.  Shorter buffers use the "short"
// constant tail of the same tables.
TEXT ·vectorCrc32(SB), NOSPLIT|NOFRAME, $0-36
	MOVWZ	crc+0(FP), R3   // incoming crc
	MOVWZ	ctab+4(FP), R14   // crc poly id
	MOVD    p+8(FP), R4
	MOVD    p_len+16(FP), R5 // p len

	// R3 = incoming crc
	// R14 = constant table identifier
	// R4 = address of bytes
	// R5 = length of bytes

	// defines for index loads

	MOVD	$16,off16
	MOVD	$32,off32
	MOVD	$48,off48
	MOVD	$64,off64
	MOVD	$80,off80
	MOVD	$96,off96
	MOVD	$112,off112
	MOVD	$0,R15

	MOVD	R3,R10	// save initial crc

	NOR	R3,R3,R3  // ^crc (work on the inverted CRC)
	MOVWZ	R3,R3	// 32 bits
	VXOR	zeroes,zeroes,zeroes  // clear the V reg
	VSPLTISW $-1,V0
	VSLDOI	$4,V29,V0,mask_32bit
	VSLDOI	$8,V29,V0,mask_64bit

	VXOR	V8,V8,V8
	MTVSRD	R3,VS40	// crc initial value VS40 = V8

#ifdef REFLECT
	VSLDOI	$8,zeroes,V8,V8  // or: VSLDOI V29,V8,V27,4 for top 32 bits?
#else
	VSLDOI	$4,V8,zeroes,V8
#endif

#ifdef BYTESWAP_DATA
	MOVD	$·byteswapcons(SB),R3
	LVX	(R3),byteswap
#endif

	CMPU	R5,$256		// length of bytes
	BLT	short

	RLDICR	$0,R5,$56,R6 // chunk to process (round len down to 128)

	// First step for larger sizes
l1:	MOVD	$32768,R7
	MOVD	R7,R9
	CMP	R6,R7   // compare R6, R7 (MAX SIZE)
	BGT	top	// less than MAX, just do remainder
	MOVD	R6,R7
top:
	SUB	R7,R6,R6

	// mainloop does 128 bytes at a time
	SRD	$7,R7

	// determine the offset into the constants table to start with.
	// Each constant is 128 bytes, used against 16 bytes of data.
	SLD	$4,R7,R8
	SRD	$3,R9,R9
	SUB	R8,R9,R8

	// The last iteration is reduced in a separate step
	ADD	$-1,R7
	MOVD	R7,CTR

	// Determine which constant table (depends on poly)
	CMP	R14,$1
	BNE	castTable
	MOVD	$·IEEEConst(SB),R3
	BR	startConst
castTable:
	MOVD	$·CastConst(SB),R3

startConst:
	ADD	R3,R8,R3	// starting point in constants table

	VXOR	V0,V0,V0	// clear the V regs
	VXOR	V1,V1,V1
	VXOR	V2,V2,V2
	VXOR	V3,V3,V3
	VXOR	V4,V4,V4
	VXOR	V5,V5,V5
	VXOR	V6,V6,V6
	VXOR	V7,V7,V7

	LVX	(R3),const1	// loading constant values

	CMP	R15,$1		// Identify warm up pass
	BEQ	next

	// First warm up pass: load the bytes to process
	LVX	(R4),V16
	LVX	(R4+off16),V17
	LVX	(R4+off32),V18
	LVX	(R4+off48),V19
	LVX	(R4+off64),V20
	LVX	(R4+off80),V21
	LVX	(R4+off96),V22
	LVX	(R4+off112),V23
	ADD	$128,R4		// bump up to next 128 bytes in buffer

	VXOR	V16,V8,V16	// xor in initial CRC in V8

next:
	BC	18,0,first_warm_up_done	// bdz: branch if CTR == 0

	ADD	$16,R3		// bump up to next constants
	LVX	(R3),const2	// table values

	VPMSUMD	V16,const1,V8 // second warm up pass
	LVX	(R4),V16	// load from buffer
	OR	$0,R2,R2	// nop, for instruction scheduling

	VPMSUMD	V17,const1,V9	// vpmsumd with constants
	LVX	(R4+off16),V17	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V18,const1,V10	// vpmsumd with constants
	LVX	(R4+off32),V18	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V19,const1,V11	// vpmsumd with constants
	LVX	(R4+off48),V19	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V20,const1,V12	// vpmsumd with constants
	LVX	(R4+off64),V20	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V21,const1,V13	// vpmsumd with constants
	LVX	(R4+off80),V21	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V22,const1,V14	// vpmsumd with constants
	LVX	(R4+off96),V22	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V23,const1,V15	// vpmsumd with constants
	LVX	(R4+off112),V23	// load next from buffer

	ADD	$128,R4		// bump up to next 128 bytes in buffer

	BC	18,0,first_cool_down	// bdz: branch if CTR == 0

	// Steady state: fold the previous products into the accumulators
	// V0-V7 while multiplying the freshly loaded data; constants
	// alternate between const1 and const2 so the next table entry can be
	// loaded mid-iteration.
cool_top:
	LVX	(R3),const1	// constants
	ADD	$16,R3		// inc to next constants
	OR	$0,R2,R2

	VXOR	V0,V8,V0	// xor in previous vpmsumd
	VPMSUMD	V16,const2,V8	// vpmsumd with constants
	LVX	(R4),V16	// buffer
	OR	$0,R2,R2

	VXOR	V1,V9,V1	// xor in previous
	VPMSUMD	V17,const2,V9	// vpmsumd with constants
	LVX	(R4+off16),V17	// next in buffer
	OR	$0,R2,R2

	VXOR	V2,V10,V2	// xor in previous
	VPMSUMD	V18,const2,V10	// vpmsumd with constants
	LVX	(R4+off32),V18	// next in buffer
	OR	$0,R2,R2

	VXOR	V3,V11,V3	// xor in previous
	VPMSUMD	V19,const2,V11	// vpmsumd with constants
	LVX	(R4+off48),V19	// next in buffer
	LVX	(R3),const2	// get next constant
	OR	$0,R2,R2

	VXOR	V4,V12,V4	// xor in previous
	VPMSUMD	V20,const1,V12	// vpmsumd with constants
	LVX	(R4+off64),V20	// next in buffer
	OR	$0,R2,R2

	VXOR	V5,V13,V5	// xor in previous
	VPMSUMD	V21,const1,V13	// vpmsumd with constants
	LVX	(R4+off80),V21	// next in buffer
	OR	$0,R2,R2

	VXOR	V6,V14,V6	// xor in previous
	VPMSUMD	V22,const1,V14	// vpmsumd with constants
	LVX	(R4+off96),V22	// next in buffer
	OR	$0,R2,R2

	VXOR	V7,V15,V7	// xor in previous
	VPMSUMD	V23,const1,V15	// vpmsumd with constants
	LVX	(R4+off112),V23	// next in buffer

	ADD	$128,R4		// bump up buffer pointer
	BC	16,0,cool_top	// bdnz: are we done?

first_cool_down:

	// load the constants
	// xor in the previous value
	// vpmsumd the result with constants

	LVX	(R3),const1
	ADD	$16,R3

	VXOR	V0,V8,V0
	VPMSUMD V16,const1,V8
	OR	$0,R2,R2

	VXOR	V1,V9,V1
	VPMSUMD	V17,const1,V9
	OR	$0,R2,R2

	VXOR	V2,V10,V2
	VPMSUMD	V18,const1,V10
	OR	$0,R2,R2

	VXOR	V3,V11,V3
	VPMSUMD	V19,const1,V11
	OR	$0,R2,R2

	VXOR	V4,V12,V4
	VPMSUMD	V20,const1,V12
	OR	$0,R2,R2

	VXOR	V5,V13,V5
	VPMSUMD	V21,const1,V13
	OR	$0,R2,R2

	VXOR	V6,V14,V6
	VPMSUMD	V22,const1,V14
	OR	$0,R2,R2

	VXOR	V7,V15,V7
	VPMSUMD	V23,const1,V15
	OR	$0,R2,R2

second_cool_down:

	// Fold the last round of products into the accumulators.
	VXOR    V0,V8,V0
	VXOR    V1,V9,V1
	VXOR    V2,V10,V2
	VXOR    V3,V11,V3
	VXOR    V4,V12,V4
	VXOR    V5,V13,V5
	VXOR    V6,V14,V6
	VXOR    V7,V15,V7

#ifdef REFLECT
	VSLDOI  $4,V0,zeroes,V0
	VSLDOI  $4,V1,zeroes,V1
	VSLDOI  $4,V2,zeroes,V2
	VSLDOI  $4,V3,zeroes,V3
	VSLDOI  $4,V4,zeroes,V4
	VSLDOI  $4,V5,zeroes,V5
	VSLDOI  $4,V6,zeroes,V6
	VSLDOI  $4,V7,zeroes,V7
#endif

	LVX	(R4),V8
	LVX	(R4+off16),V9
	LVX	(R4+off32),V10
	LVX	(R4+off48),V11
	LVX	(R4+off64),V12
	LVX	(R4+off80),V13
	LVX	(R4+off96),V14
	LVX	(R4+off112),V15

	ADD	$128,R4

	VXOR	V0,V8,V16
	VXOR	V1,V9,V17
	VXOR	V2,V10,V18
	VXOR	V3,V11,V19
	VXOR	V4,V12,V20
	VXOR	V5,V13,V21
	VXOR	V6,V14,V22
	VXOR	V7,V15,V23

	MOVD    $1,R15		// mark warm up pass done
	CMP     $0,R6
	ADD     $128,R6

	BNE	l1		// more 32KB outer chunks to process

	// Final reduction of the accumulated 128-byte state against the
	// tail constants at the end of the table.
	ANDCC   $127,R5
	SUBC	R5,$128,R6
	ADD	R3,R6,R3

	SRD	$4,R5,R7
	MOVD	R7,CTR
	LVX	(R3),V0
	LVX	(R3+off16),V1
	LVX	(R3+off32),V2
	LVX	(R3+off48),V3
	LVX	(R3+off64),V4
	LVX	(R3+off80),V5
	LVX	(R3+off96),V6
	LVX	(R3+off112),V7

	ADD	$128,R3

	VPMSUMW	V16,V0,V0
	VPMSUMW	V17,V1,V1
	VPMSUMW	V18,V2,V2
	VPMSUMW	V19,V3,V3
	VPMSUMW	V20,V4,V4
	VPMSUMW	V21,V5,V5
	VPMSUMW	V22,V6,V6
	VPMSUMW	V23,V7,V7

	// now reduce the tail (up to 112 remaining bytes, 16 at a time)

	CMP	$0,R7
	BEQ	next1

	LVX	(R4),V16
	LVX	(R3),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1	// bdz

	LVX	(R4+off16),V16
	LVX	(R3+off16),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1	// bdz

	LVX	(R4+off32),V16
	LVX	(R3+off32),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1	// bdz

	LVX	(R4+off48),V16
	LVX	(R3+off48),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1	// bdz

	LVX	(R4+off64),V16
	LVX	(R3+off64),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1	// bdz

	LVX	(R4+off80),V16
	LVX	(R3+off80),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1	// bdz

	LVX	(R4+off96),V16
	LVX	(R3+off96),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0

	// Combine all eight accumulators into V0.
next1:
	VXOR	V0,V1,V0
	VXOR	V2,V3,V2
	VXOR	V4,V5,V4
	VXOR	V6,V7,V6
	VXOR	V0,V2,V0
	VXOR	V4,V6,V4
	VXOR	V0,V4,V0

barrett_reduction:

	// Reduce the 128-bit remainder in V0 to a 32-bit CRC using the
	// poly-specific Barrett constants.
	CMP	R14,$1
	BNE	barcstTable
	MOVD	$·IEEEBarConst(SB),R3
	BR	startbarConst
barcstTable:
	MOVD	$·CastBarConst(SB),R3

startbarConst:
	LVX	(R3),const1
	LVX	(R3+off16),const2

	VSLDOI	$8,V0,V0,V1
	VXOR	V0,V1,V0	// xor two 64 bit results together

#ifdef REFLECT
	VSPLTISB $1,V1
	VSL	V0,V1,V0	// shift left by one bit
#endif

	VAND	V0,mask_64bit,V0

#ifndef	REFLECT

	VPMSUMD	V0,const1,V1
	VSLDOI	$8,zeroes,V1,V1
	VPMSUMD	V1,const2,V1
	VXOR	V0,V1,V0
	VSLDOI	$8,V0,zeroes,V0

#else

	VAND	V0,mask_32bit,V1
	VPMSUMD	V1,const1,V1
	VAND	V1,mask_32bit,V1
	VPMSUMD	V1,const2,V1
	VXOR	V0,V1,V0
	VSLDOI  $4,V0,zeroes,V0

#endif

	MFVSRD	VS32,R3 // VS32 = V0

	NOR	R3,R3,R3 // return ^crc
	MOVW	R3,ret+32(FP)
	RET

first_warm_up_done:

	LVX	(R3),const1
	ADD	$16,R3

	VPMSUMD	V16,const1,V8
	VPMSUMD	V17,const1,V9
	VPMSUMD	V18,const1,V10
	VPMSUMD	V19,const1,V11
	VPMSUMD	V20,const1,V12
	VPMSUMD	V21,const1,V13
	VPMSUMD	V22,const1,V14
	VPMSUMD	V23,const1,V15

	BR	second_cool_down

	// Inputs shorter than 256 bytes: multiply each 16-byte block against
	// the "short" constants at offset 4080 in the table, accumulating
	// into V19/V20; the vN labels let shorter inputs join the xor chain
	// part-way through.
short:
	CMP	$0,R5
	BEQ	zero

	// compute short constants

	CMP     R14,$1
	BNE     castshTable
	MOVD	$·IEEEConst(SB),R3
	ADD	$4080,R3
	BR      startshConst
castshTable:
	MOVD	$·CastConst(SB),R3
	ADD	$4080,R3

startshConst:
	SUBC	R5,$256,R6	// sub from 256
	ADD	R3,R6,R3

	// calculate where to start

	SRD	$4,R5,R7
	MOVD	R7,CTR

	VXOR	V19,V19,V19
	VXOR	V20,V20,V20

	LVX	(R4),V0
	LVX	(R3),V16
	VXOR	V0,V8,V0	// xor in initial CRC
	VPMSUMW	V0,V16,V0
	BC	18,0,v0		// bdz

	LVX	(R4+off16),V1
	LVX	(R3+off16),V17
	VPMSUMW	V1,V17,V1
	BC	18,0,v1		// bdz

	LVX	(R4+off32),V2
	LVX	(R3+off32),V16
	VPMSUMW	V2,V16,V2
	BC	18,0,v2		// bdz

	LVX	(R4+off48),V3
	LVX	(R3+off48),V17
	VPMSUMW	V3,V17,V3
	BC	18,0,v3		// bdz

	LVX	(R4+off64),V4
	LVX	(R3+off64),V16
	VPMSUMW	V4,V16,V4
	BC	18,0,v4		// bdz

	LVX	(R4+off80),V5
	LVX	(R3+off80),V17
	VPMSUMW	V5,V17,V5
	BC	18,0,v5		// bdz

	LVX	(R4+off96),V6
	LVX	(R3+off96),V16
	VPMSUMW	V6,V16,V6
	BC	18,0,v6		// bdz

	LVX	(R4+off112),V7
	LVX	(R3+off112),V17
	VPMSUMW	V7,V17,V7
	BC	18,0,v7		// bdz

	ADD	$128,R3
	ADD	$128,R4

	LVX	(R4),V8
	LVX	(R3),V16
	VPMSUMW	V8,V16,V8
	BC	18,0,v8		// bdz

	LVX	(R4+off16),V9
	LVX	(R3+off16),V17
	VPMSUMW	V9,V17,V9
	BC	18,0,v9		// bdz

	LVX	(R4+off32),V10
	LVX	(R3+off32),V16
	VPMSUMW	V10,V16,V10
	BC	18,0,v10	// bdz

	LVX	(R4+off48),V11
	LVX	(R3+off48),V17
	VPMSUMW	V11,V17,V11
	BC	18,0,v11	// bdz

	LVX	(R4+off64),V12
	LVX	(R3+off64),V16
	VPMSUMW	V12,V16,V12
	BC	18,0,v12	// bdz

	LVX	(R4+off80),V13
	LVX	(R3+off80),V17
	VPMSUMW	V13,V17,V13
	BC	18,0,v13	// bdz

	LVX	(R4+off96),V14
	LVX	(R3+off96),V16
	VPMSUMW	V14,V16,V14
	BC	18,0,v14	// bdz

	LVX	(R4+off112),V15
	LVX	(R3+off112),V17
	VPMSUMW	V15,V17,V15

	VXOR	V19,V15,V19
v14:	VXOR	V20,V14,V20
v13:	VXOR	V19,V13,V19
v12:	VXOR	V20,V12,V20
v11:	VXOR	V19,V11,V19
v10:	VXOR	V20,V10,V20
v9:	VXOR	V19,V9,V19
v8:	VXOR	V20,V8,V20
v7:	VXOR	V19,V7,V19
v6:	VXOR	V20,V6,V20
v5:	VXOR	V19,V5,V19
v4:	VXOR	V20,V4,V20
v3:	VXOR	V19,V3,V19
v2:	VXOR	V20,V2,V20
v1:	VXOR	V19,V1,V19
v0:	VXOR	V20,V0,V20

	VXOR	V19,V20,V0

	BR	barrett_reduction

zero:
	// This case is the original crc, so just return it
	MOVW    R10,ret+32(FP)
	RET
708