// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// The vectorized implementation found below is a derived work
// from code written by Anton Blanchard <anton@au.ibm.com> found
// at https://github.com/antonblanchard/crc32-vpmsum.  The original
// is dual licensed under GPL and Apache 2.  As the copyright holder
// for the work, IBM has contributed this new work under
// the golang license.

// Changes include porting to Go assembler with modifications for
// the Go ABI for ppc64le.
#include "textflag.h"

// NOTE(review): POWER8_OFFSET is not referenced in this file;
// presumably consumed elsewhere in the package — confirm before removing.
#define POWER8_OFFSET 132

// Register aliases for the offsets used by the indexed vector loads
// (LVX (Rbase+offNN),Vn) throughout the vector code path.
#define off16	R16
#define off32	R17
#define off48	R18
#define off64	R19
#define off80	R20
#define off96	R21
#define	off112	R22

// Vector register aliases: the two active constants loaded from the
// constants tables, plus fixed masks/zero used in the reductions.
#define const1	V24
#define const2	V25

#define byteswap	V26
#define mask_32bit	V27
#define mask_64bit	V28
#define zeroes		V29

// Largest chunk handled per outer pass of the vector main loop (see l1:).
#define MAX_SIZE	32*1024
// ppc64le is little endian: use the bit-reflected variant of the algorithm.
#define REFLECT

// func ppc64SlicingUpdateBy8(crc uint32, table8 *slicing8Table, p []byte) uint32
//
// Scalar slicing-by-8 CRC update: consumes 8 bytes of p per main-loop
// iteration using eight 256-entry lookup tables (1024 bytes apart),
// then finishes any 0-7 leftover bytes one at a time.  The incoming
// and returned CRC are in the usual pre/post-inverted convention
// (NOR on entry and exit).
TEXT ·ppc64SlicingUpdateBy8(SB), NOSPLIT|NOFRAME, $0-44
	MOVWZ	crc+0(FP), R3   // incoming crc
	MOVD    table8+8(FP), R4   // *Table
	MOVD    p+16(FP), R5
	MOVD    p_len+24(FP), R6 // p len

	CMP     $0,R6           // len == 0?
	BNE     start
	MOVW    R3,ret+40(FP)   // return crc
	RET

start:
	NOR     R3,R3,R7        // ^crc
	MOVWZ	R7,R7		// 32 bits
	CMP	R6,$16
	MOVD	R6,CTR
	BLT	short
	SRAD    $3,R6,R8        // 8 byte chunks
	MOVD    R8,CTR

loop:
	MOVWZ	0(R5),R8	// 0-3 bytes of p
	MOVWZ	4(R5),R9	// 4-7 bytes of p
	MOVD	R4,R10		// &tab[0]
	XOR	R7,R8,R7	// crc ^= byte[0:3]
	RLDICL	$40,R9,$56,R17	// p[7]
	SLD	$2,R17,R17	// p[7]*4
	RLDICL	$40,R7,$56,R8	// crc>>24
	ADD	R17,R10,R17	// &tab[0][p[7]]
	SLD	$2,R8,R8	// crc>>24*4
	RLDICL	$48,R9,$56,R18	// p[6]
	SLD	$2,R18,R18	// p[6]*4
	ADD	$1024,R10,R10	// tab[1]
	MOVWZ	0(R17),R21	// tab[0][p[7]]
	RLDICL	$56,R9,$56,R19	// p[5]
	ADD	R10,R18,R18	// &tab[1][p[6]]
	SLD	$2,R19,R19	// p[5]*4
	MOVWZ	0(R18),R22	// tab[1][p[6]]
	ADD	$1024,R10,R10	// tab[2]
	XOR	R21,R22,R21	// xor done R22
	ADD	R19,R10,R19	// &tab[2][p[5]]
	ANDCC	$255,R9,R20	// p[4]
	SLD	$2,R20,R20	// p[4]*4
	MOVWZ	0(R19),R23	// tab[2][p[5]]
	ADD	$1024,R10,R10	// &tab[3]
	ADD	R20,R10,R20	// tab[3][p[4]]
	XOR	R21,R23,R21	// xor done R23
	ADD	$1024,R10,R10	// &tab[4]
	MOVWZ	0(R20),R24	// tab[3][p[4]]
	ADD	R10,R8,R23	// &tab[4][crc>>24]
	XOR	R21,R24,R21	// xor done R24
	MOVWZ	0(R23),R25	// tab[4][crc>>24]
	RLDICL	$48,R7,$56,R24	// crc>>16&0xFF
	XOR	R21,R25,R21	// xor done R25
	ADD	$1024,R10,R10	// &tab[5]
	SLD	$2,R24,R24	// crc>>16&0xFF*4
	ADD	R24,R10,R24	// &tab[5][crc>>16&0xFF]
	MOVWZ	0(R24),R26	// tab[5][crc>>16&0xFF]
	XOR	R21,R26,R21	// xor done R26
	RLDICL	$56,R7,$56,R25	// crc>>8
	ADD	$1024,R10,R10	// &tab[6]
	SLD	$2,R25,R25	// crc>>8&0xFF*4
	ADD	R25,R10,R25	// &tab[6][crc>>8&0xFF]
	MOVBZ   R7,R26          // crc&0xFF
	ADD     $1024,R10,R10   // &tab[7]
	MOVWZ	0(R25),R27	// tab[6][crc>>8&0xFF]
	SLD	$2,R26,R26	// crc&0xFF*4
	XOR	R21,R27,R21	// xor done R27
	ADD	R26,R10,R26	// &tab[7][crc&0xFF]
	ADD     $8,R5           // p = p[8:]
	MOVWZ	0(R26),R28	// tab[7][crc&0xFF]
	XOR	R21,R28,R21	// xor done R28
	MOVWZ	R21,R7		// crc for next round
	BC	16,0,loop	// next 8 bytes
	ANDCC	$7,R6,R8	// any leftover bytes
	BEQ	done		// none --> done
	MOVD	R8,CTR		// byte count
	PCALIGN $16             // align short loop
short:
	MOVBZ   0(R5),R8        // get v
	MOVBZ   R7,R9           // byte(crc) -> R9
	SRD     $8,R7,R14       // crc>>8
	XOR     R8,R9,R8        // byte(crc)^v -> R8
	ADD	$1,R5		// ptr to next v
	SLD     $2,R8           // convert index-> bytes
	ADD     R8,R4,R9        // &tab[byte(crc)^v]
	MOVWZ   0(R9),R10       // tab[byte(crc)^v]
	XOR     R10,R14,R7      // loop crc in R7
	BC      16,0,short
done:
	NOR     R7,R7,R7        // ^crc
	MOVW    R7,ret+40(FP)   // return crc
	RET

#ifdef BYTESWAP_DATA
// 16-byte permute-control constant loaded into the byteswap register
// alias (V26) below; only assembled when BYTESWAP_DATA is defined.
DATA ·byteswapcons+0(SB)/8,$0x0706050403020100
DATA ·byteswapcons+8(SB)/8,$0x0f0e0d0c0b0a0908

GLOBL ·byteswapcons+0(SB),RODATA,$16
#endif

// func vectorCrc32(crc uint32, ctab uint32, p []byte) uint32
//
// Vectorized CRC32 using VPMSUMD/VPMSUMW (vector polynomial multiply-sum)
// with a final Barrett reduction.  ctab selects the constants tables:
// 1 = IEEE polynomial (·IEEEConst / ·IEEEBarConst), anything else =
// Castagnoli (·CastConst / ·CastBarConst).  Buffers of at least 256
// bytes go through the 128-byte-per-iteration main loop; smaller ones
// take the "short" path.  CRC is pre/post-inverted (NOR on entry/exit).
TEXT ·vectorCrc32(SB), NOSPLIT|NOFRAME, $0-36
	MOVWZ	crc+0(FP), R3   // incoming crc
	MOVWZ	ctab+4(FP), R14   // crc poly id
	MOVD    p+8(FP), R4
	MOVD    p_len+16(FP), R5 // p len

	// R3 = incoming crc
	// R14 = constant table identifier
	// R4 = address of bytes
	// R5 = length of bytes

	// defines for index loads

	MOVD	$16,off16
	MOVD	$32,off32
	MOVD	$48,off48
	MOVD	$64,off64
	MOVD	$80,off80
	MOVD	$96,off96
	MOVD	$112,off112
	MOVD	$0,R15

	MOVD	R3,R10	// save initial crc

	NOR	R3,R3,R3  // ^crc
	MOVWZ	R3,R3	// 32 bits
	VXOR	zeroes,zeroes,zeroes  // clear the V reg
	VSPLTISW $-1,V0
	VSLDOI	$4,V29,V0,mask_32bit
	VSLDOI	$8,V29,V0,mask_64bit

	VXOR	V8,V8,V8
	MTVSRD	R3,VS40	// crc initial value VS40 = V8

#ifdef REFLECT
	VSLDOI	$8,zeroes,V8,V8  // or: VSLDOI V29,V8,V27,4 for top 32 bits?
#else
	VSLDOI	$4,V8,zeroes,V8
#endif

#ifdef BYTESWAP_DATA
	MOVD	$·byteswapcons(SB),R3
	LVX	(R3),byteswap
#endif

	CMPU	R5,$256		// length of bytes
	BLT	short

	RLDICR	$0,R5,$56,R6 // chunk to process

	// First step for larger sizes
l1:	MOVD	$32768,R7
	MOVD	R7,R9
	CMP	R6,R7   // compare R6, R7 (MAX SIZE)
	BGT	top	// less than MAX, just do remainder
	MOVD	R6,R7
top:
	SUB	R7,R6,R6

	// mainloop does 128 bytes at a time
	SRD	$7,R7

	// determine the offset into the constants table to start with.
	// Each constant is 128 bytes, used against 16 bytes of data.
	SLD	$4,R7,R8
	SRD	$3,R9,R9
	SUB	R8,R9,R8

	// The last iteration is reduced in a separate step
	ADD	$-1,R7
	MOVD	R7,CTR

	// Determine which constant table (depends on poly)
	CMP	R14,$1
	BNE	castTable
	MOVD	$·IEEEConst(SB),R3
	BR	startConst
castTable:
	MOVD	$·CastConst(SB),R3

startConst:
	ADD	R3,R8,R3	// starting point in constants table

	VXOR	V0,V0,V0	// clear the V regs
	VXOR	V1,V1,V1
	VXOR	V2,V2,V2
	VXOR	V3,V3,V3
	VXOR	V4,V4,V4
	VXOR	V5,V5,V5
	VXOR	V6,V6,V6
	VXOR	V7,V7,V7

	LVX	(R3),const1	// loading constant values

	CMP	R15,$1		// Identify warm up pass
	BEQ	next

	// First warm up pass: load the bytes to process
	LVX	(R4),V16
	LVX	(R4+off16),V17
	LVX	(R4+off32),V18
	LVX	(R4+off48),V19
	LVX	(R4+off64),V20
	LVX	(R4+off80),V21
	LVX	(R4+off96),V22
	LVX	(R4+off112),V23
	ADD	$128,R4		// bump up to next 128 bytes in buffer

	VXOR	V16,V8,V16	// xor in initial CRC in V8

next:
	BC	18,0,first_warm_up_done

	ADD	$16,R3		// bump up to next constants
	LVX	(R3),const2	// table values

	VPMSUMD	V16,const1,V8 // second warm up pass
	LVX	(R4),V16	// load from buffer
	OR	$0,R2,R2

	VPMSUMD	V17,const1,V9	// vpmsumd with constants
	LVX	(R4+off16),V17	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V18,const1,V10	// vpmsumd with constants
	LVX	(R4+off32),V18	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V19,const1,V11	// vpmsumd with constants
	LVX	(R4+off48),V19	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V20,const1,V12	// vpmsumd with constants
	LVX	(R4+off64),V20	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V21,const1,V13	// vpmsumd with constants
	LVX	(R4+off80),V21	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V22,const1,V14	// vpmsumd with constants
	LVX	(R4+off96),V22	// load next from buffer
	OR	$0,R2,R2

	VPMSUMD	V23,const1,V15	// vpmsumd with constants
	LVX	(R4+off112),V23	// load next from buffer

	ADD	$128,R4		// bump up to next 128 bytes in buffer

	BC	18,0,first_cool_down

cool_top:
	LVX	(R3),const1	// constants
	ADD	$16,R3		// inc to next constants
	OR	$0,R2,R2

	VXOR	V0,V8,V0	// xor in previous vpmsumd
	VPMSUMD	V16,const2,V8	// vpmsumd with constants
	LVX	(R4),V16	// buffer
	OR	$0,R2,R2

	VXOR	V1,V9,V1	// xor in previous
	VPMSUMD	V17,const2,V9	// vpmsumd with constants
	LVX	(R4+off16),V17	// next in buffer
	OR	$0,R2,R2

	VXOR	V2,V10,V2	// xor in previous
	VPMSUMD	V18,const2,V10	// vpmsumd with constants
	LVX	(R4+off32),V18	// next in buffer
	OR	$0,R2,R2

	VXOR	V3,V11,V3	// xor in previous
	VPMSUMD	V19,const2,V11	// vpmsumd with constants
	LVX	(R4+off48),V19	// next in buffer
	LVX	(R3),const2	// get next constant
	OR	$0,R2,R2

	VXOR	V4,V12,V4	// xor in previous
	VPMSUMD	V20,const1,V12	// vpmsumd with constants
	LVX	(R4+off64),V20	// next in buffer
	OR	$0,R2,R2

	VXOR	V5,V13,V5	// xor in previous
	VPMSUMD	V21,const1,V13	// vpmsumd with constants
	LVX	(R4+off80),V21	// next in buffer
	OR	$0,R2,R2

	VXOR	V6,V14,V6	// xor in previous
	VPMSUMD	V22,const1,V14	// vpmsumd with constants
	LVX	(R4+off96),V22	// next in buffer
	OR	$0,R2,R2

	VXOR	V7,V15,V7	// xor in previous
	VPMSUMD	V23,const1,V15	// vpmsumd with constants
	LVX	(R4+off112),V23	// next in buffer

	ADD	$128,R4		// bump up buffer pointer
	BC	16,0,cool_top	// are we done?

first_cool_down:

	// load the constants
	// xor in the previous value
	// vpmsumd the result with constants

	LVX	(R3),const1
	ADD	$16,R3

	VXOR	V0,V8,V0
	VPMSUMD V16,const1,V8
	OR	$0,R2,R2

	VXOR	V1,V9,V1
	VPMSUMD	V17,const1,V9
	OR	$0,R2,R2

	VXOR	V2,V10,V2
	VPMSUMD	V18,const1,V10
	OR	$0,R2,R2

	VXOR	V3,V11,V3
	VPMSUMD	V19,const1,V11
	OR	$0,R2,R2

	VXOR	V4,V12,V4
	VPMSUMD	V20,const1,V12
	OR	$0,R2,R2

	VXOR	V5,V13,V5
	VPMSUMD	V21,const1,V13
	OR	$0,R2,R2

	VXOR	V6,V14,V6
	VPMSUMD	V22,const1,V14
	OR	$0,R2,R2

	VXOR	V7,V15,V7
	VPMSUMD	V23,const1,V15
	OR	$0,R2,R2

second_cool_down:

	VXOR    V0,V8,V0
	VXOR    V1,V9,V1
	VXOR    V2,V10,V2
	VXOR    V3,V11,V3
	VXOR    V4,V12,V4
	VXOR    V5,V13,V5
	VXOR    V6,V14,V6
	VXOR    V7,V15,V7

#ifdef REFLECT
	VSLDOI  $4,V0,zeroes,V0
	VSLDOI  $4,V1,zeroes,V1
	VSLDOI  $4,V2,zeroes,V2
	VSLDOI  $4,V3,zeroes,V3
	VSLDOI  $4,V4,zeroes,V4
	VSLDOI  $4,V5,zeroes,V5
	VSLDOI  $4,V6,zeroes,V6
	VSLDOI  $4,V7,zeroes,V7
#endif

	LVX	(R4),V8
	LVX	(R4+off16),V9
	LVX	(R4+off32),V10
	LVX	(R4+off48),V11
	LVX	(R4+off64),V12
	LVX	(R4+off80),V13
	LVX	(R4+off96),V14
	LVX	(R4+off112),V15

	ADD	$128,R4

	VXOR	V0,V8,V16
	VXOR	V1,V9,V17
	VXOR	V2,V10,V18
	VXOR	V3,V11,V19
	VXOR	V4,V12,V20
	VXOR	V5,V13,V21
	VXOR	V6,V14,V22
	VXOR	V7,V15,V23

	MOVD    $1,R15
	CMP     $0,R6
	ADD     $128,R6

	BNE	l1
	ANDCC   $127,R5
	SUBC	R5,$128,R6
	ADD	R3,R6,R3

	SRD	$4,R5,R7
	MOVD	R7,CTR
	LVX	(R3),V0
	LVX	(R3+off16),V1
	LVX	(R3+off32),V2
	LVX	(R3+off48),V3
	LVX	(R3+off64),V4
	LVX	(R3+off80),V5
	LVX	(R3+off96),V6
	LVX	(R3+off112),V7

	ADD	$128,R3

	VPMSUMW	V16,V0,V0
	VPMSUMW	V17,V1,V1
	VPMSUMW	V18,V2,V2
	VPMSUMW	V19,V3,V3
	VPMSUMW	V20,V4,V4
	VPMSUMW	V21,V5,V5
	VPMSUMW	V22,V6,V6
	VPMSUMW	V23,V7,V7

	// now reduce the tail

	CMP	$0,R7
	BEQ	next1

	LVX	(R4),V16
	LVX	(R3),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1

	LVX	(R4+off16),V16
	LVX	(R3+off16),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1

	LVX	(R4+off32),V16
	LVX	(R3+off32),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1

	LVX	(R4+off48),V16
	LVX	(R3+off48),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1

	LVX	(R4+off64),V16
	LVX	(R3+off64),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1

	LVX	(R4+off80),V16
	LVX	(R3+off80),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0
	BC	18,0,next1

	LVX	(R4+off96),V16
	LVX	(R3+off96),V17
	VPMSUMW	V16,V17,V16
	VXOR	V0,V16,V0

next1:
	VXOR	V0,V1,V0
	VXOR	V2,V3,V2
	VXOR	V4,V5,V4
	VXOR	V6,V7,V6
	VXOR	V0,V2,V0
	VXOR	V4,V6,V4
	VXOR	V0,V4,V0

barrett_reduction:

	// Select the Barrett constants for the chosen polynomial.
	CMP	R14,$1
	BNE	barcstTable
	MOVD	$·IEEEBarConst(SB),R3
	BR	startbarConst
barcstTable:
	MOVD	$·CastBarConst(SB),R3

startbarConst:
	LVX	(R3),const1
	LVX	(R3+off16),const2

	VSLDOI	$8,V0,V0,V1
	VXOR	V0,V1,V0

#ifdef REFLECT
	VSPLTISB $1,V1
	VSL	V0,V1,V0
#endif

	VAND	V0,mask_64bit,V0

#ifndef	REFLECT

	VPMSUMD	V0,const1,V1
	VSLDOI	$8,zeroes,V1,V1
	VPMSUMD	V1,const2,V1
	VXOR	V0,V1,V0
	VSLDOI	$8,V0,zeroes,V0

#else

	VAND	V0,mask_32bit,V1
	VPMSUMD	V1,const1,V1
	VAND	V1,mask_32bit,V1
	VPMSUMD	V1,const2,V1
	VXOR	V0,V1,V0
	VSLDOI  $4,V0,zeroes,V0

#endif

	MFVSRD	VS32,R3 // VS32 = V0

	NOR	R3,R3,R3 // return ^crc
	MOVW	R3,ret+32(FP)
	RET

first_warm_up_done:

	LVX	(R3),const1
	ADD	$16,R3

	VPMSUMD	V16,const1,V8
	VPMSUMD	V17,const1,V9
	VPMSUMD	V18,const1,V10
	VPMSUMD	V19,const1,V11
	VPMSUMD	V20,const1,V12
	VPMSUMD	V21,const1,V13
	VPMSUMD	V22,const1,V14
	VPMSUMD	V23,const1,V15

	BR	second_cool_down

short:
	CMP	$0,R5
	BEQ	zero

	// compute short constants

	CMP     R14,$1
	BNE     castshTable
	MOVD	$·IEEEConst(SB),R3
	ADD	$4080,R3
	BR      startshConst
castshTable:
	MOVD	$·CastConst(SB),R3
	ADD	$4080,R3

startshConst:
	SUBC	R5,$256,R6	// sub from 256
	ADD	R3,R6,R3

	// calculate where to start

	SRD	$4,R5,R7
	MOVD	R7,CTR

	VXOR	V19,V19,V19
	VXOR	V20,V20,V20

	LVX	(R4),V0
	LVX	(R3),V16
	VXOR	V0,V8,V0
	VPMSUMW	V0,V16,V0
	BC	18,0,v0

	LVX	(R4+off16),V1
	LVX	(R3+off16),V17
	VPMSUMW	V1,V17,V1
	BC	18,0,v1

	LVX	(R4+off32),V2
	LVX	(R3+off32),V16
	VPMSUMW	V2,V16,V2
	BC	18,0,v2

	LVX	(R4+off48),V3
	LVX	(R3+off48),V17
	VPMSUMW	V3,V17,V3
	BC	18,0,v3

	LVX	(R4+off64),V4
	LVX	(R3+off64),V16
	VPMSUMW	V4,V16,V4
	BC	18,0,v4

	LVX	(R4+off80),V5
	LVX	(R3+off80),V17
	VPMSUMW	V5,V17,V5
	BC	18,0,v5

	LVX	(R4+off96),V6
	LVX	(R3+off96),V16
	VPMSUMW	V6,V16,V6
	BC	18,0,v6

	LVX	(R4+off112),V7
	LVX	(R3+off112),V17
	VPMSUMW	V7,V17,V7
	BC	18,0,v7

	ADD	$128,R3
	ADD	$128,R4

	LVX	(R4),V8
	LVX	(R3),V16
	VPMSUMW	V8,V16,V8
	BC	18,0,v8

	LVX	(R4+off16),V9
	LVX	(R3+off16),V17
	VPMSUMW	V9,V17,V9
	BC	18,0,v9

	LVX	(R4+off32),V10
	LVX	(R3+off32),V16
	VPMSUMW	V10,V16,V10
	BC	18,0,v10

	LVX	(R4+off48),V11
	LVX	(R3+off48),V17
	VPMSUMW	V11,V17,V11
	BC	18,0,v11

	LVX	(R4+off64),V12
	LVX	(R3+off64),V16
	VPMSUMW	V12,V16,V12
	BC	18,0,v12

	LVX	(R4+off80),V13
	LVX	(R3+off80),V17
	VPMSUMW	V13,V17,V13
	BC	18,0,v13

	LVX	(R4+off96),V14
	LVX	(R3+off96),V16
	VPMSUMW	V14,V16,V14
	BC	18,0,v14

	LVX	(R4+off112),V15
	LVX	(R3+off112),V17
	VPMSUMW	V15,V17,V15

	VXOR	V19,V15,V19
v14:	VXOR	V20,V14,V20
v13:	VXOR	V19,V13,V19
v12:	VXOR	V20,V12,V20
v11:	VXOR	V19,V11,V19
v10:	VXOR	V20,V10,V20
v9:	VXOR	V19,V9,V19
v8:	VXOR	V20,V8,V20
v7:	VXOR	V19,V7,V19
v6:	VXOR	V20,V6,V20
v5:	VXOR	V19,V5,V19
v4:	VXOR	V20,V4,V20
v3:	VXOR	V19,V3,V19
v2:	VXOR	V20,V2,V20
v1:	VXOR	V19,V1,V19
v0:	VXOR	V20,V0,V20

	VXOR	V19,V20,V0

	BR	barrett_reduction

zero:
	// This case is the original crc, so just return it
	MOVW    R10,ret+32(FP)
	RET
706