//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
//  Copyright (c) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
//  Copyright (c) 2017 International Business Machines Corp.
//  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).

#include <ppc-asm.h>
#include "ppc-opcode.h"

#undef toc

#ifndef r1
#define r1 1
#endif

#ifndef r2
#define r2 2
#endif

	.section	.rodata
.balign 16

.byteswap_constant:
	/* byte reverse permute constant */
	.octa 0x0F0E0D0C0B0A09080706050403020100

#define __ASSEMBLY__
#include "crc32c_ppc_constants.h"

	.text

#if defined(__BIG_ENDIAN__) && defined(REFLECT)
#define BYTESWAP_DATA
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
#define BYTESWAP_DATA
#else
#undef BYTESWAP_DATA
#endif
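/*
 * Loads are byte-swapped (via vperm) whenever the memory byte order and the
 * algorithm's orientation disagree: big-endian data with the bit-reflected
 * (REFLECT) algorithm, or little-endian data with the non-reflected one.
 */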

#define off16		r25
#define off32		r26
#define off48		r27
#define off64		r28
#define off80		r29
#define off96		r30
#define off112		r31

#define const1		v24
#define const2		v25

#define byteswap	v26
#define	mask_32bit	v27
#define	mask_64bit	v28
#define zeroes		v29

#ifdef BYTESWAP_DATA
#define VPERM(A, B, C, D) vperm	A, B, C, D
#else
#define VPERM(A, B, C, D)
#endif

/* unsigned int __crc32_vpmsum(unsigned int crc, void *p, unsigned long len) */
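/*
 * Per the PPC64 ELF ABI: r3 = crc, r4 = p, r5 = len on entry, and the
 * 32-bit result is returned in r3.
 */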
FUNC_START(__crc32_vpmsum)
	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)
	std	r26,-48(r1)
	std	r25,-56(r1)

	li	off16,16
	li	off32,32
	li	off48,48
	li	off64,64
	li	off80,80
	li	off96,96
	li	off112,112
	li	r0,0

	/* Enough room to save 10 non-volatile VMX registers */
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16
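	/*
	 * No stack frame is allocated: this is a leaf routine, so the
	 * non-volatile GPRs (r25-r31, saved above at r1-56..r1-8) and the
	 * VMX registers (v20-v27 at r6, v28-v29 at r7, just below the GPR
	 * saves) live in the area the ABI reserves below the stack pointer
	 * for leaf functions.
	 */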

	stvx	v20,0,r6
	stvx	v21,off16,r6
	stvx	v22,off32,r6
	stvx	v23,off48,r6
	stvx	v24,off64,r6
	stvx	v25,off80,r6
	stvx	v26,off96,r6
	stvx	v27,off112,r6
	stvx	v28,0,r7
	stvx	v29,off16,r7

	mr	r10,r3

	vxor	zeroes,zeroes,zeroes
	vspltisw v0,-1

	vsldoi	mask_32bit,zeroes,v0,4
	vsldoi	mask_64bit,zeroes,v0,8

	/* Get the initial value into v8 */
	vxor	v8,v8,v8
	MTVRD(v8, r3)
#ifdef REFLECT
	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */
#else
	vsldoi	v8,v8,zeroes,4	/* shift into top 32 bits */
#endif

#ifdef BYTESWAP_DATA
	addis	r3,r2,.byteswap_constant@toc@ha
	addi	r3,r3,.byteswap_constant@toc@l

	lvx	byteswap,0,r3
	addi	r3,r3,16
#endif

	cmpdi	r5,256
	blt	.Lshort

	rldicr	r6,r5,0,56

	/* Checksum in blocks of MAX_SIZE */
1:	lis	r7,MAX_SIZE@h
	ori	r7,r7,MAX_SIZE@l
	mr	r9,r7
	cmpd	r6,r7
	bgt	2f
	mr	r7,r6
2:	subf	r6,r7,r6

	/* our main loop does 128 bytes at a time */
	srdi	r7,r7,7

	/*
	 * Work out the offset into the constants table to start at. Each
	 * constant is 16 bytes and covers 128 bytes of input data, so the
	 * table holds one eighth of MAX_SIZE in bytes (128 / 16 = 8).
	 */
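	/*
	 * For example, if MAX_SIZE is 32768 (the value assumed here for the
	 * generated constants header), the table is 32768/8 = 4096 bytes
	 * long; a full chunk of 32768/128 = 256 blocks starts at offset
	 * 4096 - 256*16 = 0, while a final chunk of 16 blocks (2KB) starts
	 * at 4096 - 16*16 = 3840.
	 */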
	sldi	r8,r7,4
	srdi	r9,r9,3
	subf	r8,r8,r9

	/* We reduce our final 128 bytes in a separate step */
	addi	r7,r7,-1
	mtctr	r7

	addis	r3,r2,.constants@toc@ha
	addi	r3,r3,.constants@toc@l

	/* Find the start of our constants */
	add	r3,r3,r8

	/* zero v0-v7 which will contain our checksums */
	vxor	v0,v0,v0
	vxor	v1,v1,v1
	vxor	v2,v2,v2
	vxor	v3,v3,v3
	vxor	v4,v4,v4
	vxor	v5,v5,v5
	vxor	v6,v6,v6
	vxor	v7,v7,v7

	lvx	const1,0,r3

	/*
	 * If we are looping back to consume more data we use the values
	 * already in v16-v23.
	 */
	cmpdi	r0,1
	beq	2f

	/* First warm up pass */
	lvx	v16,0,r4
	lvx	v17,off16,r4
	VPERM(v16,v16,v16,byteswap)
	VPERM(v17,v17,v17,byteswap)
	lvx	v18,off32,r4
	lvx	v19,off48,r4
	VPERM(v18,v18,v18,byteswap)
	VPERM(v19,v19,v19,byteswap)
	lvx	v20,off64,r4
	lvx	v21,off80,r4
	VPERM(v20,v20,v20,byteswap)
	VPERM(v21,v21,v21,byteswap)
	lvx	v22,off96,r4
	lvx	v23,off112,r4
	VPERM(v22,v22,v22,byteswap)
	VPERM(v23,v23,v23,byteswap)
	addi	r4,r4,8*16

	/* xor in initial value */
	vxor	v16,v16,v8

2:	bdz	.Lfirst_warm_up_done

	addi	r3,r3,16
	lvx	const2,0,r3

	/* Second warm up pass */
	VPMSUMD(v8,v16,const1)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	VPMSUMD(v9,v17,const1)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	VPMSUMD(v10,v18,const1)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	VPMSUMD(v11,v19,const1)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	ori	r2,r2,0

	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdz	.Lfirst_cool_down

	/*
	 * Main loop. It is modulo scheduled so that each 128-byte block takes
	 * three iterations to complete: it is loaded in the first, multiplied
	 * with vpmsumd in the second, and xored into the accumulators in the
	 * third.
	 */
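	/*
	 * Concretely, each pass below xors the products computed last time
	 * (v8-v15) into the running checksums v0-v7, starts vpmsumd on the
	 * data loaded last time (v16-v23), and refills v16-v23 with the next
	 * 128 bytes; const1/const2 alternate so the next constant can be
	 * loaded while the current one is still in use. The "ori r2,r2,0"
	 * instructions are architected no-ops, presumably placed to pad
	 * instruction dispatch groups on POWER8.
	 */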
	.balign	16
4:	lvx	const1,0,r3
	addi	r3,r3,16
	ori	r2,r2,0

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const2)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const2)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const2)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const2)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	lvx	const2,0,r3
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdnz	4b

.Lfirst_cool_down:
	/* First cool down pass */
	lvx	const1,0,r3
	addi	r3,r3,16

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const1)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const1)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const1)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const1)
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	ori	r2,r2,0

.Lsecond_cool_down:
	/* Second cool down pass */
	vxor	v0,v0,v8
	vxor	v1,v1,v9
	vxor	v2,v2,v10
	vxor	v3,v3,v11
	vxor	v4,v4,v12
	vxor	v5,v5,v13
	vxor	v6,v6,v14
	vxor	v7,v7,v15

#ifdef REFLECT
	/*
	 * vpmsumd produces a 96-bit result in the least significant bits
	 * of the register. Since we are bit reflected we have to shift it
	 * left 32 bits so it occupies the least significant bits in the
	 * bit-reflected domain.
	 */
	vsldoi	v0,v0,zeroes,4
	vsldoi	v1,v1,zeroes,4
	vsldoi	v2,v2,zeroes,4
	vsldoi	v3,v3,zeroes,4
	vsldoi	v4,v4,zeroes,4
	vsldoi	v5,v5,zeroes,4
	vsldoi	v6,v6,zeroes,4
	vsldoi	v7,v7,zeroes,4
#endif

	/* xor with last 1024 bits */
	lvx	v8,0,r4
	lvx	v9,off16,r4
	VPERM(v8,v8,v8,byteswap)
	VPERM(v9,v9,v9,byteswap)
	lvx	v10,off32,r4
	lvx	v11,off48,r4
	VPERM(v10,v10,v10,byteswap)
	VPERM(v11,v11,v11,byteswap)
	lvx	v12,off64,r4
	lvx	v13,off80,r4
	VPERM(v12,v12,v12,byteswap)
	VPERM(v13,v13,v13,byteswap)
	lvx	v14,off96,r4
	lvx	v15,off112,r4
	VPERM(v14,v14,v14,byteswap)
	VPERM(v15,v15,v15,byteswap)

	addi	r4,r4,8*16

	vxor	v16,v0,v8
	vxor	v17,v1,v9
	vxor	v18,v2,v10
	vxor	v19,v3,v11
	vxor	v20,v4,v12
	vxor	v21,v5,v13
	vxor	v22,v6,v14
	vxor	v23,v7,v15

	li	r0,1
	cmpdi	r6,0
	addi	r6,r6,128
	bne	1b

	/* Work out how many bytes we have left */
	andi.	r5,r5,127

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,128
	add	r3,r3,r6

	/* How many 16 byte chunks are in the tail */
	srdi	r7,r5,4
	mtctr	r7

	/*
	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
	 * 32 bits to include the trailing 32 bits of zeros
	 */
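	/*
	 * Each of the eight constants loaded below corresponds to a
	 * different distance from the end of the data; multiplying by it
	 * (vpmsumw) folds that accumulator down so the partial results can
	 * simply be xored together afterwards.
	 */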
	lvx	v0,0,r3
	lvx	v1,off16,r3
	lvx	v2,off32,r3
	lvx	v3,off48,r3
	lvx	v4,off64,r3
	lvx	v5,off80,r3
	lvx	v6,off96,r3
	lvx	v7,off112,r3
	addi	r3,r3,8*16

	VPMSUMW(v0,v16,v0)
	VPMSUMW(v1,v17,v1)
	VPMSUMW(v2,v18,v2)
	VPMSUMW(v3,v19,v3)
	VPMSUMW(v4,v20,v4)
	VPMSUMW(v5,v21,v5)
	VPMSUMW(v6,v22,v6)
	VPMSUMW(v7,v23,v7)

	/* Now reduce the tail (0 - 112 bytes) */
	cmpdi	r7,0
	beq	1f
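	/*
	 * Each remaining 16-byte chunk is multiplied by the constant for its
	 * distance from the end of the buffer and folded into v0; the count
	 * register (set from the chunk count above) decides how far down the
	 * unrolled sequence we get.
	 */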

	lvx	v16,0,r4
	lvx	v17,0,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off16,r4
	lvx	v17,off16,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off32,r4
	lvx	v17,off32,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off48,r4
	lvx	v17,off48,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off64,r4
	lvx	v17,off64,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off80,r4
	lvx	v17,off80,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off96,r4
	lvx	v17,off96,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16

	/* Now xor all the parallel chunks together */
1:	vxor	v0,v0,v1
	vxor	v2,v2,v3
	vxor	v4,v4,v5
	vxor	v6,v6,v7

	vxor	v0,v0,v2
	vxor	v4,v4,v6

	vxor	v0,v0,v4

.Lbarrett_reduction:
	/* Barrett constants */
	addis	r3,r2,.barrett_constants@toc@ha
	addi	r3,r3,.barrett_constants@toc@l

	lvx	const1,0,r3
	lvx	const2,off16,r3

	vsldoi	v1,v0,v0,8
	vxor	v0,v0,v1		/* xor two 64 bit results together */

#ifdef REFLECT
	/* shift left one bit */
	vspltisb v1,1
	vsl	v0,v0,v1
#endif
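	/*
	 * (Multiplying two bit-reflected 64-bit values yields the reflected
	 * product shifted right by one bit, so the reflected path presumably
	 * needs this single corrective shift before the reduction.)
	 */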

	vand	v0,v0,mask_64bit

#ifndef REFLECT
	/*
	 * Now for the Barrett reduction algorithm. The idea is to calculate
	 * q, the quotient by our polynomial, so that q*n is the multiple we
	 * need to subtract. By doing the computation 2x bits higher (i.e. 64
	 * bits) and shifting the result back down 2x bits, we round down to
	 * the nearest multiple.
	 */
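	/*
	 * In GF(2) polynomial terms, with n the CRC polynomial and m a
	 * precomputed approximation of x^64/n (both taken from the
	 * .barrett_constants table): q = floor(a*m / x^64) and
	 * crc = a xor q*n = a mod n.
	 */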
	VPMSUMD(v1,v0,const1)	/* ma */
	vsldoi	v1,zeroes,v1,8	/* q = floor(ma/(2^64)) */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Get the result into r3. We need to shift it left 8 bytes
	 * (words are shown 0-3 from most to least significant, X marks
	 * the word holding the result):
	 * V0 [ 0 1 2 X ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi	v0,v0,zeroes,8	/* shift result into top 64 bits */
#else
	/*
	 * The reflected version of Barrett reduction. Instead of bit
	 * reflecting our data (which is expensive to do), we bit reflect our
	 * constants and our algorithm, which means the intermediate data in
	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
	 * the algorithm because we don't carry in mod 2 arithmetic.
	 */
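	/*
	 * Reflected sketch of the same reduction: take the low 32 bits of a,
	 * multiply by the reflected m', keep 32 bits, multiply by the
	 * reflected n', and xor into a; the 32-bit remainder then sits in
	 * the upper half of the low 64 bits.
	 */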
	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
	VPMSUMD(v1,v1,const1)		/* ma */
	vand	v1,v1,mask_32bit	/* bottom 32 bits of ma */
	VPMSUMD(v1,v1,const2)		/* qn */
	vxor	v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Since we are bit reflected, the result (i.e. the low 32 bits) is in
	 * the high 32 bits. We just need to shift it left 4 bytes:
	 * V0 [ 0 1 X 3 ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi	v0,v0,zeroes,4		/* shift result into top 64 bits of v0 */
#endif

	/* Get it into r3 */
	MFVRD(r3, v0)

.Lout:
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	lvx	v20,0,r6
	lvx	v21,off16,r6
	lvx	v22,off32,r6
	lvx	v23,off48,r6
	lvx	v24,off64,r6
	lvx	v25,off80,r6
	lvx	v26,off96,r6
	lvx	v27,off112,r6
	lvx	v28,0,r7
	lvx	v29,off16,r7

	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	ld	r26,-48(r1)
	ld	r25,-56(r1)

	blr

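	/*
	 * Reached via the first bdz above when the pipelined main loop has
	 * no full iterations to run: multiply the 128 bytes loaded by the
	 * first warm-up pass and drop straight into the second cool-down.
	 */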
.Lfirst_warm_up_done:
	lvx	const1,0,r3
	addi	r3,r3,16

	VPMSUMD(v8,v16,const1)
	VPMSUMD(v9,v17,const1)
	VPMSUMD(v10,v18,const1)
	VPMSUMD(v11,v19,const1)
	VPMSUMD(v12,v20,const1)
	VPMSUMD(v13,v21,const1)
	VPMSUMD(v14,v22,const1)
	VPMSUMD(v15,v23,const1)

	b	.Lsecond_cool_down

.Lshort:
	cmpdi	r5,0
	beq	.Lzero
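	/*
	 * Short path for lengths below 256 bytes (len is assumed to be a
	 * multiple of 16 here; the C wrapper presumably handles any ragged
	 * head/tail): each 16-byte chunk is multiplied by the short constant
	 * matching its distance from the end of the buffer, hence the table
	 * is entered at offset 256 - len.
	 */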

	addis	r3,r2,.short_constants@toc@ha
	addi	r3,r3,.short_constants@toc@l

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,256
	add	r3,r3,r6

	/* How many 16 byte chunks? */
	srdi	r7,r5,4
	mtctr	r7

	vxor	v19,v19,v19
	vxor	v20,v20,v20

	lvx	v0,0,r4
	lvx	v16,0,r3
	VPERM(v0,v0,v16,byteswap)
	vxor	v0,v0,v8	/* xor in initial value */
	VPMSUMW(v0,v0,v16)
	bdz	.Lv0

	lvx	v1,off16,r4
	lvx	v17,off16,r3
	VPERM(v1,v1,v17,byteswap)
	VPMSUMW(v1,v1,v17)
	bdz	.Lv1

	lvx	v2,off32,r4
	lvx	v16,off32,r3
	VPERM(v2,v2,v16,byteswap)
	VPMSUMW(v2,v2,v16)
	bdz	.Lv2

	lvx	v3,off48,r4
	lvx	v17,off48,r3
	VPERM(v3,v3,v17,byteswap)
	VPMSUMW(v3,v3,v17)
	bdz	.Lv3

	lvx	v4,off64,r4
	lvx	v16,off64,r3
	VPERM(v4,v4,v16,byteswap)
	VPMSUMW(v4,v4,v16)
	bdz	.Lv4

	lvx	v5,off80,r4
	lvx	v17,off80,r3
	VPERM(v5,v5,v17,byteswap)
	VPMSUMW(v5,v5,v17)
	bdz	.Lv5

	lvx	v6,off96,r4
	lvx	v16,off96,r3
	VPERM(v6,v6,v16,byteswap)
	VPMSUMW(v6,v6,v16)
	bdz	.Lv6

	lvx	v7,off112,r4
	lvx	v17,off112,r3
	VPERM(v7,v7,v17,byteswap)
	VPMSUMW(v7,v7,v17)
	bdz	.Lv7

	addi	r3,r3,128
	addi	r4,r4,128

	lvx	v8,0,r4
	lvx	v16,0,r3
	VPERM(v8,v8,v16,byteswap)
	VPMSUMW(v8,v8,v16)
	bdz	.Lv8

	lvx	v9,off16,r4
	lvx	v17,off16,r3
	VPERM(v9,v9,v17,byteswap)
	VPMSUMW(v9,v9,v17)
	bdz	.Lv9

	lvx	v10,off32,r4
	lvx	v16,off32,r3
	VPERM(v10,v10,v16,byteswap)
	VPMSUMW(v10,v10,v16)
	bdz	.Lv10

	lvx	v11,off48,r4
	lvx	v17,off48,r3
	VPERM(v11,v11,v17,byteswap)
	VPMSUMW(v11,v11,v17)
	bdz	.Lv11

	lvx	v12,off64,r4
	lvx	v16,off64,r3
	VPERM(v12,v12,v16,byteswap)
	VPMSUMW(v12,v12,v16)
	bdz	.Lv12

	lvx	v13,off80,r4
	lvx	v17,off80,r3
	VPERM(v13,v13,v17,byteswap)
	VPMSUMW(v13,v13,v17)
	bdz	.Lv13

	lvx	v14,off96,r4
	lvx	v16,off96,r3
	VPERM(v14,v14,v16,byteswap)
	VPMSUMW(v14,v14,v16)
	bdz	.Lv14

	lvx	v15,off112,r4
	lvx	v17,off112,r3
	VPERM(v15,v15,v17,byteswap)
	VPMSUMW(v15,v15,v17)

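	/*
	 * Fall-through cascade: each bdz above jumps to the label just past
	 * the last product it computed, so only the chunks that were actually
	 * processed get folded into the two accumulators v19/v20.
	 */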
.Lv15:	vxor	v19,v19,v15
.Lv14:	vxor	v20,v20,v14
.Lv13:	vxor	v19,v19,v13
.Lv12:	vxor	v20,v20,v12
.Lv11:	vxor	v19,v19,v11
.Lv10:	vxor	v20,v20,v10
.Lv9:	vxor	v19,v19,v9
.Lv8:	vxor	v20,v20,v8
.Lv7:	vxor	v19,v19,v7
.Lv6:	vxor	v20,v20,v6
.Lv5:	vxor	v19,v19,v5
.Lv4:	vxor	v20,v20,v4
.Lv3:	vxor	v19,v19,v3
.Lv2:	vxor	v20,v20,v2
.Lv1:	vxor	v19,v19,v1
.Lv0:	vxor	v20,v20,v0

	vxor	v0,v19,v20

	b	.Lbarrett_reduction

.Lzero:
	mr	r3,r10
	b	.Lout

FUNC_END(__crc32_vpmsum)