1//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2//  Copyright (c) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
3//  Copyright (c) 2017 International Business Machines Corp.
4//  All rights reserved.
5//  This source code is licensed under both the GPLv2 (found in the
6//  COPYING file in the root directory) and Apache 2.0 License
7//  (found in the LICENSE.Apache file in the root directory).
8
9#if defined (__clang__)
10#include "third-party/gcc/ppc-asm.h"
11#else
12#include <ppc-asm.h>
13#endif
14#include "ppc-opcode.h"
15
#undef toc

/*
 * ppc-asm.h may not provide aliases for r1 (stack pointer) and r2 (TOC
 * pointer); define them so the code below can name them directly.
 */
#ifndef r1
#define r1 1
#endif

#ifndef r2
#define r2 2
#endif

	.section	.rodata
.balign 16

.byteswap_constant:
	/* byte reverse permute constant */
	.octa 0x0F0E0D0C0B0A09080706050403020100

/* Pull in the precomputed folding/Barrett constant tables as data. */
#define __ASSEMBLY__
#include "crc32c_ppc_constants.h"

	.text

/*
 * Input bytes only need reversing when the host byte order disagrees with
 * the bit ordering of the CRC: big-endian host with a reflected CRC, or
 * little-endian host with a non-reflected CRC.
 */
#if defined(__BIG_ENDIAN__) && defined(REFLECT)
#define BYTESWAP_DATA
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
#define BYTESWAP_DATA
#else
#undef BYTESWAP_DATA
#endif

/*
 * Fixed byte offsets 16..112, kept in non-volatile GPRs so they can be used
 * as lvx/stvx index registers throughout the function.
 */
#define off16		r25
#define off32		r26
#define off48		r27
#define off64		r28
#define off80		r29
#define off96		r30
#define off112		r31

/* Current pair of folding constants loaded from the constants table. */
#define const1		v24
#define const2		v25

/* byteswap: vperm control vector for byte reversal (loaded when needed) */
#define byteswap	v26
/* mask_32bit: ones in the low 4 bytes (selects the bottom 32 bits) */
#define	mask_32bit	v27
/* mask_64bit: ones in the low 8 bytes (selects the bottom 64 bits) */
#define	mask_64bit	v28
/* zeroes: always-zero vector */
#define zeroes		v29

/* VPERM() compiles away entirely when no byte swap is required. */
#ifdef BYTESWAP_DATA
#define VPERM(A, B, C, D) vperm	A, B, C, D
#else
#define VPERM(A, B, C, D)
#endif
67
/*
 * unsigned int __crc32_vpmsum(unsigned int crc, void *p, unsigned long len)
 *
 * Vectorised CRC using the POWER8 vpmsum (vector polynomial multiply-sum)
 * instructions.  The main loop folds 128 bytes per iteration across eight
 * parallel accumulators (v0-v7), software-pipelined over three iterations
 * (load / vpmsum / xor).  The partial results are then reduced with the
 * constants table and finished with a Barrett reduction.
 *
 * In:      r3 = initial crc, r4 = data pointer, r5 = length in bytes
 * Out:     r3 = resulting crc
 * Uses:    r0, r6-r10, ctr, v0-v23 as scratch; non-volatile r25-r31 and
 *          v20-v29 are saved and restored around the body.
 */
FUNC_START(__crc32_vpmsum)
	/*
	 * Save the non-volatile GPRs used as offset registers.
	 * NOTE(review): these are stored below r1 without adjusting the stack
	 * pointer -- this relies on the ABI-protected area under the stack
	 * pointer; confirm its size for the target ABI.
	 */
	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)
	std	r26,-48(r1)
	std	r25,-56(r1)

	/* Load the fixed lvx/stvx byte offsets. */
	li	off16,16
	li	off32,32
	li	off48,48
	li	off64,64
	li	off80,80
	li	off96,96
	li	off112,112
	/* r0 = 0: first pass of the outer loop (tested with cmpdi r0,1 below) */
	li	r0,0

	/* Enough room for saving 10 non volatile VMX registers */
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	stvx	v20,0,r6
	stvx	v21,off16,r6
	stvx	v22,off32,r6
	stvx	v23,off48,r6
	stvx	v24,off64,r6
	stvx	v25,off80,r6
	stvx	v26,off96,r6
	stvx	v27,off112,r6
	stvx	v28,0,r7
	stvx	v29,off16,r7

	/* Keep the original crc so the len == 0 path can return it unchanged. */
	mr	r10,r3

	vxor	zeroes,zeroes,zeroes
	vspltisw v0,-1			/* v0 = all ones */

	vsldoi	mask_32bit,zeroes,v0,4	/* ones in the low 4 bytes */
	vsldoi	mask_64bit,zeroes,v0,8	/* ones in the low 8 bytes */

	/* Get the initial value into v8 */
	vxor	v8,v8,v8
	MTVRD(v8, r3)
#ifdef REFLECT
	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */
#else
	vsldoi	v8,v8,zeroes,4	/* shift into top 32 bits */
#endif

#ifdef BYTESWAP_DATA
	/* TOC-relative address of the byte-reverse permute constant. */
	addis	r3,r2,.byteswap_constant@toc@ha
	addi	r3,r3,.byteswap_constant@toc@l

	lvx	byteswap,0,r3
	addi	r3,r3,16
#endif

	/* Less than 256 bytes: skip the 128-byte-per-iteration main loop. */
	cmpdi	r5,256
	blt	.Lshort

	/* r6 = len & ~127: bytes handled in whole 128-byte blocks. */
	rldicr	r6,r5,0,56

	/* Checksum in blocks of MAX_SIZE */
1:	lis	r7,MAX_SIZE@h
	ori	r7,r7,MAX_SIZE@l
	mr	r9,r7
	cmpd	r6,r7
	bgt	2f
	mr	r7,r6		/* r7 = min(remaining, MAX_SIZE) */
2:	subf	r6,r7,r6	/* r6 = bytes remaining after this chunk */

	/* our main loop does 128 bytes at a time */
	srdi	r7,r7,7

	/*
	 * Work out the offset into the constants table to start at. Each
	 * constant is 16 bytes, and it is used against 128 bytes of input
	 * data - 128 / 16 = 8
	 */
	sldi	r8,r7,4
	srdi	r9,r9,3
	subf	r8,r8,r9

	/* We reduce our final 128 bytes in a separate step */
	addi	r7,r7,-1
	mtctr	r7

	addis	r3,r2,.constants@toc@ha
	addi	r3,r3,.constants@toc@l

	/* Find the start of our constants */
	add	r3,r3,r8

	/* zero v0-v7 which will contain our checksums */
	vxor	v0,v0,v0
	vxor	v1,v1,v1
	vxor	v2,v2,v2
	vxor	v3,v3,v3
	vxor	v4,v4,v4
	vxor	v5,v5,v5
	vxor	v6,v6,v6
	vxor	v7,v7,v7

	lvx	const1,0,r3

	/*
	 * If we are looping back to consume more data we use the values
	 * already in v16-v23.
	 */
	cmpdi	r0,1		/* r0 is set to 1 at the bottom of this loop */
	beq	2f

	/* First warm up pass */
	lvx	v16,0,r4
	lvx	v17,off16,r4
	VPERM(v16,v16,v16,byteswap)
	VPERM(v17,v17,v17,byteswap)
	lvx	v18,off32,r4
	lvx	v19,off48,r4
	VPERM(v18,v18,v18,byteswap)
	VPERM(v19,v19,v19,byteswap)
	lvx	v20,off64,r4
	lvx	v21,off80,r4
	VPERM(v20,v20,v20,byteswap)
	VPERM(v21,v21,v21,byteswap)
	lvx	v22,off96,r4
	lvx	v23,off112,r4
	VPERM(v22,v22,v22,byteswap)
	VPERM(v23,v23,v23,byteswap)
	addi	r4,r4,8*16

	/* xor in initial value */
	vxor	v16,v16,v8

2:	bdz	.Lfirst_warm_up_done

	addi	r3,r3,16
	lvx	const2,0,r3

	/* Second warm up pass */
	VPMSUMD(v8,v16,const1)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	/* nop; NOTE(review): presumably a dispatch-group/scheduling hint,
	 * repeated throughout the pipelined loop below. */
	ori	r2,r2,0

	VPMSUMD(v9,v17,const1)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	VPMSUMD(v10,v18,const1)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	VPMSUMD(v11,v19,const1)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	ori	r2,r2,0

	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdz	.Lfirst_cool_down

	/*
	 * main loop. We modulo schedule it such that it takes three iterations
	 * to complete - first iteration load, second iteration vpmsum, third
	 * iteration xor.
	 */
	.balign	16
4:	lvx	const1,0,r3
	addi	r3,r3,16
	ori	r2,r2,0

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const2)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const2)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const2)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const2)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	lvx	const2,0,r3
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdnz	4b

.Lfirst_cool_down:
	/* First cool down pass */
	lvx	const1,0,r3
	addi	r3,r3,16

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const1)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const1)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const1)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const1)
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	ori	r2,r2,0

.Lsecond_cool_down:
	/* Second cool down pass */
	vxor	v0,v0,v8
	vxor	v1,v1,v9
	vxor	v2,v2,v10
	vxor	v3,v3,v11
	vxor	v4,v4,v12
	vxor	v5,v5,v13
	vxor	v6,v6,v14
	vxor	v7,v7,v15

#ifdef REFLECT
	/*
	 * vpmsumd produces a 96 bit result in the least significant bits
	 * of the register. Since we are bit reflected we have to shift it
	 * left 32 bits so it occupies the least significant bits in the
	 * bit reflected domain.
	 */
	vsldoi	v0,v0,zeroes,4
	vsldoi	v1,v1,zeroes,4
	vsldoi	v2,v2,zeroes,4
	vsldoi	v3,v3,zeroes,4
	vsldoi	v4,v4,zeroes,4
	vsldoi	v5,v5,zeroes,4
	vsldoi	v6,v6,zeroes,4
	vsldoi	v7,v7,zeroes,4
#endif

	/* xor with last 1024 bits */
	lvx	v8,0,r4
	lvx	v9,off16,r4
	VPERM(v8,v8,v8,byteswap)
	VPERM(v9,v9,v9,byteswap)
	lvx	v10,off32,r4
	lvx	v11,off48,r4
	VPERM(v10,v10,v10,byteswap)
	VPERM(v11,v11,v11,byteswap)
	lvx	v12,off64,r4
	lvx	v13,off80,r4
	VPERM(v12,v12,v12,byteswap)
	VPERM(v13,v13,v13,byteswap)
	lvx	v14,off96,r4
	lvx	v15,off112,r4
	VPERM(v14,v14,v14,byteswap)
	VPERM(v15,v15,v15,byteswap)

	addi	r4,r4,8*16

	vxor	v16,v0,v8
	vxor	v17,v1,v9
	vxor	v18,v2,v10
	vxor	v19,v3,v11
	vxor	v20,v4,v12
	vxor	v21,v5,v13
	vxor	v22,v6,v14
	vxor	v23,v7,v15

	/*
	 * If more blocks remain, loop back to the top of the outer loop; the
	 * final 128 bytes of this chunk were loaded (not yet folded) into
	 * v16-v23 above, so flag that with r0 = 1 and carry them forward.
	 */
	li	r0,1
	cmpdi	r6,0
	addi	r6,r6,128
	bne	1b

	/* Work out how many bytes we have left */
	andi.	r5,r5,127

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,128
	add	r3,r3,r6

	/* How many 16 byte chunks are in the tail */
	srdi	r7,r5,4
	mtctr	r7

	/*
	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
	 * 32 bits to include the trailing 32 bits of zeros
	 */
	lvx	v0,0,r3
	lvx	v1,off16,r3
	lvx	v2,off32,r3
	lvx	v3,off48,r3
	lvx	v4,off64,r3
	lvx	v5,off80,r3
	lvx	v6,off96,r3
	lvx	v7,off112,r3
	addi	r3,r3,8*16

	VPMSUMW(v0,v16,v0)
	VPMSUMW(v1,v17,v1)
	VPMSUMW(v2,v18,v2)
	VPMSUMW(v3,v19,v3)
	VPMSUMW(v4,v20,v4)
	VPMSUMW(v5,v21,v5)
	VPMSUMW(v6,v22,v6)
	VPMSUMW(v7,v23,v7)

	/* Now reduce the tail (0 - 112 bytes) */
	cmpdi	r7,0
	beq	1f

	lvx	v16,0,r4
	lvx	v17,0,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off16,r4
	lvx	v17,off16,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off32,r4
	lvx	v17,off32,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off48,r4
	lvx	v17,off48,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off64,r4
	lvx	v17,off64,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off80,r4
	lvx	v17,off80,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off96,r4
	lvx	v17,off96,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16

	/* Now xor all the parallel chunks together */
1:	vxor	v0,v0,v1
	vxor	v2,v2,v3
	vxor	v4,v4,v5
	vxor	v6,v6,v7

	vxor	v0,v0,v2
	vxor	v4,v4,v6

	vxor	v0,v0,v4

.Lbarrett_reduction:
	/* Barrett constants */
	addis	r3,r2,.barrett_constants@toc@ha
	addi	r3,r3,.barrett_constants@toc@l

	lvx	const1,0,r3
	lvx	const2,off16,r3

	vsldoi	v1,v0,v0,8
	vxor	v0,v0,v1		/* xor two 64 bit results together */

#ifdef REFLECT
	/* shift left one bit */
	vspltisb v1,1
	vsl	v0,v0,v1
#endif

	vand	v0,v0,mask_64bit

#ifndef REFLECT
	/*
	 * Now for the Barrett reduction algorithm. The idea is to calculate q,
	 * the multiple of our polynomial that we need to subtract. By
	 * doing the computation 2x bits higher (ie 64 bits) and shifting the
	 * result back down 2x bits, we round down to the nearest multiple.
	 */
	VPMSUMD(v1,v0,const1)	/* ma */
	vsldoi	v1,zeroes,v1,8	/* q = floor(ma/(2^64)) */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Get the result into r3. We need to shift it left 8 bytes:
	 * V0 [ 0 1 2 X ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi	v0,v0,zeroes,8	/* shift result into top 64 bits */
#else
	/*
	 * The reflected version of Barrett reduction. Instead of bit
	 * reflecting our data (which is expensive to do), we bit reflect our
	 * constants and our algorithm, which means the intermediate data in
	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
	 * the algorithm because we don't carry in mod 2 arithmetic.
	 */
	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
	VPMSUMD(v1,v1,const1)		/* ma */
	vand	v1,v1,mask_32bit	/* bottom 32bits of ma */
	VPMSUMD(v1,v1,const2)		/* qn */
	vxor	v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Since we are bit reflected, the result (ie the low 32 bits) is in
	 * the high 32 bits. We just need to shift it left 4 bytes
	 * V0 [ 0 1 X 3 ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi	v0,v0,zeroes,4		/* shift result into top 64 bits of */
#endif

	/* Get it into r3 */
	MFVRD(r3, v0)

.Lout:
	/* Restore the non-volatile VMX registers saved in the prologue... */
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	lvx	v20,0,r6
	lvx	v21,off16,r6
	lvx	v22,off32,r6
	lvx	v23,off48,r6
	lvx	v24,off64,r6
	lvx	v25,off80,r6
	lvx	v26,off96,r6
	lvx	v27,off112,r6
	lvx	v28,0,r7
	lvx	v29,off16,r7

	/* ... then the non-volatile GPRs, and return (crc already in r3). */
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	ld	r26,-48(r1)
	ld	r25,-56(r1)

	blr

.Lfirst_warm_up_done:
	/* Only one 128-byte block in this chunk: fold v16-v23 once and skip
	 * straight to the final accumulation. */
	lvx	const1,0,r3
	addi	r3,r3,16

	VPMSUMD(v8,v16,const1)
	VPMSUMD(v9,v17,const1)
	VPMSUMD(v10,v18,const1)
	VPMSUMD(v11,v19,const1)
	VPMSUMD(v12,v20,const1)
	VPMSUMD(v13,v21,const1)
	VPMSUMD(v14,v22,const1)
	VPMSUMD(v15,v23,const1)

	b	.Lsecond_cool_down

.Lshort:
	/* Short path: fewer than 256 bytes, handled in 16-byte chunks. */
	cmpdi	r5,0
	beq	.Lzero

	addis	r3,r2,.short_constants@toc@ha
	addi	r3,r3,.short_constants@toc@l

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,256
	add	r3,r3,r6

	/* How many 16 byte chunks? */
	srdi	r7,r5,4
	mtctr	r7

	/* v19/v20 accumulate alternating chunks (combined at the end). */
	vxor	v19,v19,v19
	vxor	v20,v20,v20

	lvx	v0,0,r4
	lvx	v16,0,r3
	VPERM(v0,v0,v16,byteswap)
	vxor	v0,v0,v8	/* xor in initial value */
	VPMSUMW(v0,v0,v16)
	bdz	.Lv0

	lvx	v1,off16,r4
	lvx	v17,off16,r3
	VPERM(v1,v1,v17,byteswap)
	VPMSUMW(v1,v1,v17)
	bdz	.Lv1

	lvx	v2,off32,r4
	lvx	v16,off32,r3
	VPERM(v2,v2,v16,byteswap)
	VPMSUMW(v2,v2,v16)
	bdz	.Lv2

	lvx	v3,off48,r4
	lvx	v17,off48,r3
	VPERM(v3,v3,v17,byteswap)
	VPMSUMW(v3,v3,v17)
	bdz	.Lv3

	lvx	v4,off64,r4
	lvx	v16,off64,r3
	VPERM(v4,v4,v16,byteswap)
	VPMSUMW(v4,v4,v16)
	bdz	.Lv4

	lvx	v5,off80,r4
	lvx	v17,off80,r3
	VPERM(v5,v5,v17,byteswap)
	VPMSUMW(v5,v5,v17)
	bdz	.Lv5

	lvx	v6,off96,r4
	lvx	v16,off96,r3
	VPERM(v6,v6,v16,byteswap)
	VPMSUMW(v6,v6,v16)
	bdz	.Lv6

	lvx	v7,off112,r4
	lvx	v17,off112,r3
	VPERM(v7,v7,v17,byteswap)
	VPMSUMW(v7,v7,v17)
	bdz	.Lv7

	/* More than 128 bytes: advance to the second half of the tables. */
	addi	r3,r3,128
	addi	r4,r4,128

	lvx	v8,0,r4
	lvx	v16,0,r3
	VPERM(v8,v8,v16,byteswap)
	VPMSUMW(v8,v8,v16)
	bdz	.Lv8

	lvx	v9,off16,r4
	lvx	v17,off16,r3
	VPERM(v9,v9,v17,byteswap)
	VPMSUMW(v9,v9,v17)
	bdz	.Lv9

	lvx	v10,off32,r4
	lvx	v16,off32,r3
	VPERM(v10,v10,v16,byteswap)
	VPMSUMW(v10,v10,v16)
	bdz	.Lv10

	lvx	v11,off48,r4
	lvx	v17,off48,r3
	VPERM(v11,v11,v17,byteswap)
	VPMSUMW(v11,v11,v17)
	bdz	.Lv11

	lvx	v12,off64,r4
	lvx	v16,off64,r3
	VPERM(v12,v12,v16,byteswap)
	VPMSUMW(v12,v12,v16)
	bdz	.Lv12

	lvx	v13,off80,r4
	lvx	v17,off80,r3
	VPERM(v13,v13,v17,byteswap)
	VPMSUMW(v13,v13,v17)
	bdz	.Lv13

	lvx	v14,off96,r4
	lvx	v16,off96,r3
	VPERM(v14,v14,v16,byteswap)
	VPMSUMW(v14,v14,v16)
	bdz	.Lv14

	lvx	v15,off112,r4
	lvx	v17,off112,r3
	VPERM(v15,v15,v17,byteswap)
	VPMSUMW(v15,v15,v17)

	/*
	 * Fall-through chain: entered at .LvN for N processed chunks, xoring
	 * exactly the chunks that were computed into the two accumulators.
	 */
.Lv15:	vxor	v19,v19,v15
.Lv14:	vxor	v20,v20,v14
.Lv13:	vxor	v19,v19,v13
.Lv12:	vxor	v20,v20,v12
.Lv11:	vxor	v19,v19,v11
.Lv10:	vxor	v20,v20,v10
.Lv9:	vxor	v19,v19,v9
.Lv8:	vxor	v20,v20,v8
.Lv7:	vxor	v19,v19,v7
.Lv6:	vxor	v20,v20,v6
.Lv5:	vxor	v19,v19,v5
.Lv4:	vxor	v20,v20,v4
.Lv3:	vxor	v19,v19,v3
.Lv2:	vxor	v20,v20,v2
.Lv1:	vxor	v19,v19,v1
.Lv0:	vxor	v20,v20,v0

	vxor	v0,v19,v20

	b	.Lbarrett_reduction

.Lzero:
	/* len == 0: return the initial crc unchanged. */
	mr	r3,r10
	b	.Lout

FUNC_END(__crc32_vpmsum)
757